summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec1
-rw-r--r--kernel/Kconfig.preempt13
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/audit.c278
-rw-r--r--kernel/audit.h13
-rw-r--r--kernel/audit_fsnotify.c11
-rw-r--r--kernel/audit_tree.c6
-rw-r--r--kernel/audit_watch.c3
-rw-r--r--kernel/auditfilter.c4
-rw-r--r--kernel/auditsc.c63
-rw-r--r--kernel/bpf/Kconfig2
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arena.c30
-rw-r--r--kernel/bpf/arraymap.c21
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c6
-rw-r--r--kernel/bpf/bpf_inode_storage.c6
-rw-r--r--kernel/bpf/bpf_iter.c6
-rw-r--r--kernel/bpf/bpf_lru_list.c10
-rw-r--r--kernel/bpf/bpf_struct_ops.c12
-rw-r--r--kernel/bpf/bpf_task_storage.c6
-rw-r--r--kernel/bpf/btf.c99
-rw-r--r--kernel/bpf/cgroup.c11
-rw-r--r--kernel/bpf/core.c81
-rw-r--r--kernel/bpf/cpumap.c6
-rw-r--r--kernel/bpf/crypto.c2
-rw-r--r--kernel/bpf/devmap.c2
-rw-r--r--kernel/bpf/hashtab.c43
-rw-r--r--kernel/bpf/helpers.c628
-rw-r--r--kernel/bpf/inode.c6
-rw-r--r--kernel/bpf/liveness.c733
-rw-r--r--kernel/bpf/local_storage.c2
-rw-r--r--kernel/bpf/log.c30
-rw-r--r--kernel/bpf/memalloc.c2
-rw-r--r--kernel/bpf/rqspinlock.c2
-rw-r--r--kernel/bpf/stackmap.c20
-rw-r--r--kernel/bpf/syscall.c125
-rw-r--r--kernel/bpf/tnum.c63
-rw-r--r--kernel/bpf/trampoline.c18
-rw-r--r--kernel/bpf/verifier.c876
-rw-r--r--kernel/cgroup/cgroup-internal.h11
-rw-r--r--kernel/cgroup/cgroup-v1.c19
-rw-r--r--kernel/cgroup/cgroup.c268
-rw-r--r--kernel/cgroup/cpuset-internal.h5
-rw-r--r--kernel/cgroup/cpuset-v1.c12
-rw-r--r--kernel/cgroup/cpuset.c763
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/freezer.c16
-rw-r--r--kernel/cgroup/namespace.c29
-rw-r--r--kernel/cgroup/rstat.c3
-rw-r--r--kernel/configs/hardening.config4
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/dma/contiguous.c2
-rw-r--r--kernel/dma/debug.c48
-rw-r--r--kernel/dma/debug.h20
-rw-r--r--kernel/dma/mapping.c4
-rw-r--r--kernel/dma/pool.c4
-rw-r--r--kernel/entry/common.c16
-rw-r--r--kernel/events/callchain.c40
-rw-r--r--kernel/events/core.c382
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/events/uprobes.c113
-rw-r--r--kernel/fork.c19
-rw-r--r--kernel/futex/core.c16
-rw-r--r--kernel/futex/futex.h6
-rw-r--r--kernel/futex/requeue.c6
-rw-r--r--kernel/futex/syscalls.c106
-rw-r--r--kernel/irq/Kconfig6
-rw-r--r--kernel/irq/chip.c37
-rw-r--r--kernel/irq/devres.c127
-rw-r--r--kernel/irq/handle.c49
-rw-r--r--kernel/irq/irq_test.c55
-rw-r--r--kernel/irq/irqdesc.c7
-rw-r--r--kernel/irq/msi.c3
-rw-r--r--kernel/kexec_handover.c29
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/locking/ww_mutex.h6
-rw-r--r--kernel/module/Kconfig2
-rw-r--r--kernel/module/tree_lookup.c2
-rw-r--r--kernel/nscommon.c77
-rw-r--r--kernel/nsproxy.c8
-rw-r--r--kernel/nstree.c247
-rw-r--r--kernel/params.c7
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pid_namespace.c47
-rw-r--r--kernel/power/energy_model.c29
-rw-r--r--kernel/power/hibernate.c1
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/rcu/tree.h1
-rw-r--r--kernel/rcu/tree_plugin.h8
-rw-r--r--kernel/rseq.c10
-rw-r--r--kernel/sched/build_policy.c1
-rw-r--r--kernel/sched/core.c76
-rw-r--r--kernel/sched/deadline.c124
-rw-r--r--kernel/sched/debug.c6
-rw-r--r--kernel/sched/ext.c1566
-rw-r--r--kernel/sched/ext.h25
-rw-r--r--kernel/sched/ext_idle.c174
-rw-r--r--kernel/sched/ext_internal.h1078
-rw-r--r--kernel/sched/fair.c498
-rw-r--r--kernel/sched/pelt.h4
-rw-r--r--kernel/sched/rq-offsets.c12
-rw-r--r--kernel/sched/sched.h48
-rw-r--r--kernel/sched/topology.c75
-rw-r--r--kernel/seccomp.c44
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/smp.c11
-rw-r--r--kernel/softirq.c145
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c7
-rw-r--r--kernel/time/hrtimer.c51
-rw-r--r--kernel/time/itimer.c3
-rw-r--r--kernel/time/namespace.c34
-rw-r--r--kernel/time/posix-timers.c7
-rw-r--r--kernel/time/sched_clock.c4
-rw-r--r--kernel/time/tick-common.c16
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/timekeeping.c10
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/time/vsyscall.c4
-rw-r--r--kernel/trace/bpf_trace.c201
-rw-r--r--kernel/trace/fgraph.c14
-rw-r--r--kernel/trace/fprobe.c7
-rw-r--r--kernel/trace/ftrace.c19
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.c4
-rw-r--r--kernel/trace/rv/rv.c4
-rw-r--r--kernel/trace/trace.c32
-rw-r--r--kernel/trace/trace.h10
-rw-r--r--kernel/trace/trace_dynevent.c4
-rw-r--r--kernel/trace/trace_events_user.c2
-rw-r--r--kernel/trace/trace_functions_graph.c22
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_osnoise.c6
-rw-r--r--kernel/tsacct.c3
-rw-r--r--kernel/user.c5
-rw-r--r--kernel/user_namespace.c24
-rw-r--r--kernel/utsname.c33
-rw-r--r--kernel/vhost_task.c3
-rw-r--r--kernel/workqueue.c80
143 files changed, 6644 insertions, 3817 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 2ee603a98813..1224dd937df0 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -97,6 +97,7 @@ config KEXEC_JUMP
config KEXEC_HANDOVER
bool "kexec handover"
depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ depends on !DEFERRED_STRUCT_PAGE_INIT
select MEMBLOCK_KHO_SCRATCH
select KEXEC_FILE
select DEBUG_FS
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 54ea59ff8fbe..da326800c1c9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -103,6 +103,19 @@ config PREEMPT_RT
Select this if you are building a kernel for systems which
require real-time guarantees.
+config PREEMPT_RT_NEEDS_BH_LOCK
+ bool "Enforce softirq synchronisation on PREEMPT_RT"
+ depends on PREEMPT_RT
+ help
+ Enforce synchronisation across the softirqs context. On PREEMPT_RT
+ the softirq is preemptible. This enforces the same per-CPU BLK
+ semantic non-PREEMPT_RT builds have. This should not be needed
+ because per-CPU locks were added to avoid the per-CPU BKL.
+
+ This switch provides the old behaviour for testing reasons. Select
+ this if you suspect an error with preemptible softirq and want test
+ the old synchronized behaviour.
+
config PREEMPT_COUNT
bool
diff --git a/kernel/Makefile b/kernel/Makefile
index c60623448235..41751834e764 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \
sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
- kthread.o sys_ni.o nsproxy.o \
+ kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
@@ -122,7 +122,7 @@ obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
-obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_CFI) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/audit.c b/kernel/audit.c
index 61b5744d0bb6..26a332ffb1b8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -54,6 +54,7 @@
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/security.h>
+#include <linux/lsm_hooks.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -81,6 +82,13 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK;
/* private audit network namespace index */
static unsigned int audit_net_id;
+/* Number of modules that provide a security context.
+ List of lsms that provide a security context */
+static u32 audit_subj_secctx_cnt;
+static u32 audit_obj_secctx_cnt;
+static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT];
+static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT];
+
/**
* struct audit_net - audit private network namespace data
* @sk: communication socket
@@ -195,8 +203,10 @@ static struct audit_ctl_mutex {
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
- struct sk_buff *skb; /* formatted skb ready to send */
+ struct sk_buff *skb; /* the skb for audit_log functions */
+ struct sk_buff_head skb_list; /* formatted skbs, ready to send */
struct audit_context *ctx; /* NULL or associated context */
+ struct audit_stamp stamp; /* audit stamp for these records */
gfp_t gfp_mask;
};
@@ -279,6 +289,33 @@ static pid_t auditd_pid_vnr(void)
}
/**
+ * audit_cfg_lsm - Identify a security module as providing a secctx.
+ * @lsmid: LSM identity
+ * @flags: which contexts are provided
+ *
+ * Description:
+ * Increments the count of the security modules providing a secctx.
+ * If the LSM id is already in the list leave it alone.
+ */
+void audit_cfg_lsm(const struct lsm_id *lsmid, int flags)
+{
+ int i;
+
+ if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) {
+ for (i = 0 ; i < audit_subj_secctx_cnt; i++)
+ if (audit_subj_lsms[i] == lsmid)
+ return;
+ audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid;
+ }
+ if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) {
+ for (i = 0 ; i < audit_obj_secctx_cnt; i++)
+ if (audit_obj_lsms[i] == lsmid)
+ return;
+ audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid;
+ }
+}
+
+/**
* audit_get_sk - Return the audit socket for the given network namespace
* @net: the destination network namespace
*
@@ -1113,7 +1150,6 @@ static int is_audit_feature_set(int i)
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
-
static int audit_get_feature(struct sk_buff *skb)
{
u32 seq;
@@ -1473,7 +1509,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
case AUDIT_SIGNAL_INFO:
if (lsmprop_is_set(&audit_sig_lsm)) {
err = security_lsmprop_to_secctx(&audit_sig_lsm,
- &lsmctx);
+ &lsmctx, LSM_ID_UNDEF);
if (err < 0)
return err;
}
@@ -1776,10 +1812,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
+ struct sk_buff *skb;
+
if (!ab)
return;
- kfree_skb(ab->skb);
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ kfree_skb(skb);
kmem_cache_free(audit_buffer_cache, ab);
}
@@ -1792,9 +1831,14 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
if (!ab)
return NULL;
+ skb_queue_head_init(&ab->skb_list);
+
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
+
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
goto err;
@@ -1833,11 +1877,11 @@ unsigned int audit_serial(void)
}
static inline void audit_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial)
+ struct audit_stamp *stamp)
{
- if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- ktime_get_coarse_real_ts64(t);
- *serial = audit_serial();
+ if (!ctx || !auditsc_get_stamp(ctx, stamp)) {
+ ktime_get_coarse_real_ts64(&stamp->ctime);
+ stamp->serial = audit_serial();
}
}
@@ -1860,8 +1904,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
struct audit_buffer *ab;
- struct timespec64 t;
- unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1916,12 +1958,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
}
- audit_get_stamp(ab->ctx, &t, &serial);
+ audit_get_stamp(ab->ctx, &ab->stamp);
/* cancel dummy context to enable supporting records */
if (ctx)
ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
- (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
return ab;
}
@@ -2177,33 +2221,179 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
-int audit_log_task_context(struct audit_buffer *ab)
+/**
+ * audit_buffer_aux_new - Add an aux record buffer to the skb list
+ * @ab: audit_buffer
+ * @type: message type
+ *
+ * Aux records are allocated and added to the skb list of
+ * the "main" record. The ab->skb is reset to point to the
+ * aux record on its creation. When the aux record in complete
+ * ab->skb has to be reset to point to the "main" record.
+ * This allows the audit_log_ functions to be ignorant of
+ * which kind of record it is logging to. It also avoids adding
+ * special data for aux records.
+ *
+ * On success ab->skb will point to the new aux record.
+ * Returns 0 on success, -ENOMEM should allocation fail.
+ */
+static int audit_buffer_aux_new(struct audit_buffer *ab, int type)
+{
+ WARN_ON(ab->skb != skb_peek(&ab->skb_list));
+
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask);
+ if (!ab->skb)
+ goto err;
+ if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+ goto err;
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
+ audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
+
+ return 0;
+
+err:
+ kfree_skb(ab->skb);
+ ab->skb = skb_peek(&ab->skb_list);
+ return -ENOMEM;
+}
+
+/**
+ * audit_buffer_aux_end - Switch back to the "main" record from an aux record
+ * @ab: audit_buffer
+ *
+ * Restores the "main" audit record to ab->skb.
+ */
+static void audit_buffer_aux_end(struct audit_buffer *ab)
+{
+ ab->skb = skb_peek(&ab->skb_list);
+}
+
+/**
+ * audit_log_subj_ctx - Add LSM subject information
+ * @ab: audit_buffer
+ * @prop: LSM subject properties.
+ *
+ * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record.
+ */
+int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
{
- struct lsm_prop prop;
struct lsm_context ctx;
+ char *space = "";
int error;
+ int i;
- security_current_getlsmprop_subj(&prop);
- if (!lsmprop_is_set(&prop))
+ security_current_getlsmprop_subj(prop);
+ if (!lsmprop_is_set(prop))
return 0;
- error = security_lsmprop_to_secctx(&prop, &ctx);
- if (error < 0) {
- if (error != -EINVAL)
- goto error_path;
+ if (audit_subj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+ audit_log_format(ab, " subj=%s", ctx.context);
+ security_release_secctx(&ctx);
return 0;
}
-
- audit_log_format(ab, " subj=%s", ctx.context);
- security_release_secctx(&ctx);
+ /* Multiple LSMs provide contexts. Include an aux record. */
+ audit_log_format(ab, " subj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_subj_secctx_cnt; i++) {
+ error = security_lsmprop_to_secctx(prop, &ctx,
+ audit_subj_lsms[i]->id);
+ if (error < 0) {
+ /*
+ * Don't print anything. An LSM like BPF could
+ * claim to support contexts, but only do so under
+ * certain conditions.
+ */
+ if (error == -EOPNOTSUPP)
+ continue;
+ if (error != -EINVAL)
+ audit_panic("error in audit_log_subj_ctx");
+ } else {
+ audit_log_format(ab, "%ssubj_%s=%s", space,
+ audit_subj_lsms[i]->name, ctx.context);
+ space = " ";
+ security_release_secctx(&ctx);
+ }
+ }
+ audit_buffer_aux_end(ab);
return 0;
error_path:
- audit_panic("error in audit_log_task_context");
+ audit_panic("error in audit_log_subj_ctx");
return error;
}
+EXPORT_SYMBOL(audit_log_subj_ctx);
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ struct lsm_prop prop;
+
+ security_current_getlsmprop_subj(&prop);
+ return audit_log_subj_ctx(ab, &prop);
+}
EXPORT_SYMBOL(audit_log_task_context);
+int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
+{
+ int i;
+ int rc;
+ int error = 0;
+ char *space = "";
+ struct lsm_context ctx;
+
+ if (audit_obj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return error;
+ }
+ audit_log_format(ab, " obj=%s", ctx.context);
+ security_release_secctx(&ctx);
+ return 0;
+ }
+ audit_log_format(ab, " obj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_obj_secctx_cnt; i++) {
+ rc = security_lsmprop_to_secctx(prop, &ctx,
+ audit_obj_lsms[i]->id);
+ if (rc < 0) {
+ audit_log_format(ab, "%sobj_%s=?", space,
+ audit_obj_lsms[i]->name);
+ if (rc != -EINVAL)
+ audit_panic("error in audit_log_obj_ctx");
+ error = rc;
+ } else {
+ audit_log_format(ab, "%sobj_%s=%s", space,
+ audit_obj_lsms[i]->name, ctx.context);
+ security_release_secctx(&ctx);
+ }
+ space = " ";
+ }
+
+ audit_buffer_aux_end(ab);
+ return error;
+
+error_path:
+ audit_panic("error in audit_log_obj_ctx");
+ return error;
+}
+
void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm)
{
@@ -2411,6 +2601,28 @@ int audit_signal_info(int sig, struct task_struct *t)
}
/**
+ * __audit_log_end - enqueue one audit record
+ * @skb: the buffer to send
+ */
+static void __audit_log_end(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+
+ if (audit_rate_check()) {
+ /* setup the netlink header, see the comments in
+ * kauditd_send_multicast_skb() for length quirks */
+ nlh = nlmsg_hdr(skb);
+ nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+
+ /* queue the netlink packet */
+ skb_queue_tail(&audit_queue, skb);
+ } else {
+ audit_log_lost("rate limit exceeded");
+ kfree_skb(skb);
+ }
+}
+
+/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
@@ -2422,25 +2634,15 @@ int audit_signal_info(int sig, struct task_struct *t)
void audit_log_end(struct audit_buffer *ab)
{
struct sk_buff *skb;
- struct nlmsghdr *nlh;
if (!ab)
return;
- if (audit_rate_check()) {
- skb = ab->skb;
- ab->skb = NULL;
-
- /* setup the netlink header, see the comments in
- * kauditd_send_multicast_skb() for length quirks */
- nlh = nlmsg_hdr(skb);
- nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ __audit_log_end(skb);
- /* queue the netlink packet and poke the kauditd thread */
- skb_queue_tail(&audit_queue, skb);
- wake_up_interruptible(&kauditd_wait);
- } else
- audit_log_lost("rate limit exceeded");
+ /* poke the kauditd thread */
+ wake_up_interruptible(&kauditd_wait);
audit_buffer_free(ab);
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 2a24d01c5fb0..0f05933a173b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -99,6 +99,12 @@ struct audit_proctitle {
char *value; /* the cmdline field */
};
+/* A timestamp/serial pair to identify an event */
+struct audit_stamp {
+ struct timespec64 ctime; /* time of syscall entry */
+ unsigned int serial; /* serial number for record */
+};
+
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
@@ -108,10 +114,9 @@ struct audit_context {
AUDIT_CTX_URING, /* in use by io_uring */
} context;
enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
+ struct audit_stamp stamp; /* event identifier */
int major; /* syscall number */
int uring_op; /* uring operation */
- struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
@@ -263,7 +268,7 @@ extern void audit_put_tty(struct tty_struct *tty);
extern unsigned int audit_serial(void);
#ifdef CONFIG_AUDITSYSCALL
extern int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial);
+ struct audit_stamp *stamp);
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
@@ -304,7 +309,7 @@ extern void audit_filter_inodes(struct task_struct *tsk,
struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
-#define auditsc_get_stamp(c, t, s) 0
+#define auditsc_get_stamp(c, s) 0
#define audit_put_watch(w) do { } while (0)
#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index c565fbf66ac8..b92805b317a2 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
struct audit_fsnotify_mark *audit_mark;
struct path path;
struct dentry *dentry;
- struct inode *inode;
int ret;
if (pathname[0] != '/' || pathname[len-1] == '/')
return ERR_PTR(-EINVAL);
- dentry = kern_path_locked(pathname, &path);
+ dentry = kern_path_parent(pathname, &path);
if (IS_ERR(dentry))
return ERR_CAST(dentry); /* returning an error */
- inode = path.dentry->d_inode;
- inode_unlock(inode);
+ if (d_really_is_negative(dentry)) {
+ audit_mark = ERR_PTR(-ENOENT);
+ goto out;
+ }
audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
if (unlikely(!audit_mark)) {
@@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0);
if (ret < 0) {
audit_mark->path = NULL;
fsnotify_put_mark(&audit_mark->mark);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0eae2a3c895..1605df0a171e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,8 +93,10 @@ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
+ size_t sz;
- tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
+ sz = strlen(s) + 1;
+ tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -103,7 +105,7 @@ static struct audit_tree *alloc_tree(const char *s)
INIT_LIST_HEAD(&tree->list);
INIT_LIST_HEAD(&tree->same_root);
tree->root = NULL;
- strcpy(tree->pathname, s);
+ strscpy(tree->pathname, s, sz);
}
return tree;
}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0ebbbe37a60f..a700e3c8925f 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
struct dentry *d;
- d = kern_path_locked_negative(watch->path, parent);
+ d = kern_path_parent(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
@@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
watch->ino = d_backing_inode(d)->i_ino;
}
- inode_unlock(d_backing_inode(parent->dentry));
dput(d);
return 0;
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index e3f42018ed46..c401082d9b25 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1326,7 +1326,7 @@ int audit_compare_dname_path(const struct qstr *dname, const char *path, int par
/* handle trailing slashes */
pathlen -= parentlen;
- while (p[pathlen - 1] == '/')
+ while (pathlen > 0 && p[pathlen - 1] == '/')
pathlen--;
if (pathlen != dlen)
@@ -1440,7 +1440,7 @@ static int update_lsm_rule(struct audit_krule *r)
}
/* This function will re-initialize the lsm_rule field of all applicable rules.
- * It will traverse the filter lists serarching for rules that contain LSM
+ * It will traverse the filter lists searching for rules that contain LSM
* specific filter fields. When such a rule is found, it is copied, the
* LSM field is re-initialized, and the old rule is replaced with the
* updated rule. */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index eb98cd6fe91f..d1966144bdfe 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -994,10 +994,10 @@ static void audit_reset_context(struct audit_context *ctx)
*/
ctx->current_state = ctx->state;
- ctx->serial = 0;
+ ctx->stamp.serial = 0;
+ ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
ctx->major = 0;
ctx->uring_op = 0;
- ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
memset(ctx->argv, 0, sizeof(ctx->argv));
ctx->return_code = 0;
ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
@@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
char *comm)
{
struct audit_buffer *ab;
- struct lsm_context ctx;
int rc = 0;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
@@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
- if (lsmprop_is_set(prop)) {
- if (security_lsmprop_to_secctx(prop, &ctx) < 0) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx.context);
- security_release_secctx(&ctx);
- }
- }
+ if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop))
+ rc = 1;
+
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
audit_log_end(ab);
@@ -1392,15 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic)
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
if (lsmprop_is_set(&context->ipc.oprop)) {
- struct lsm_context lsmctx;
-
- if (security_lsmprop_to_secctx(&context->ipc.oprop,
- &lsmctx) < 0) {
+ if (audit_log_obj_ctx(ab, &context->ipc.oprop))
*call_panic = 1;
- } else {
- audit_log_format(ab, " obj=%s", lsmctx.context);
- security_release_secctx(&lsmctx);
- }
}
if (context->ipc.has_perm) {
audit_log_end(ab);
@@ -1557,17 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
- if (lsmprop_is_set(&n->oprop)) {
- struct lsm_context ctx;
-
- if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) {
- if (call_panic)
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx.context);
- security_release_secctx(&ctx);
- }
- }
+ if (lsmprop_is_set(&n->oprop) &&
+ audit_log_obj_ctx(ab, &n->oprop))
+ *call_panic = 2;
/* log the audit_names record type */
switch (n->type) {
@@ -1785,8 +1763,9 @@ static void audit_log_exit(void)
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
- &context->target_ref, context->target_comm))
- call_panic = 1;
+ &context->target_ref,
+ context->target_comm))
+ call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
@@ -1917,7 +1896,7 @@ void __audit_uring_entry(u8 op)
ctx->context = AUDIT_CTX_URING;
ctx->current_state = ctx->state;
- ktime_get_coarse_real_ts64(&ctx->ctime);
+ ktime_get_coarse_real_ts64(&ctx->stamp.ctime);
}
/**
@@ -2039,7 +2018,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[3] = a4;
context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- ktime_get_coarse_real_ts64(&context->ctime);
+ ktime_get_coarse_real_ts64(&context->stamp.ctime);
}
/**
@@ -2508,21 +2487,17 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
/**
* auditsc_get_stamp - get local copies of audit_context values
* @ctx: audit_context for the task
- * @t: timespec64 to store time recorded in the audit_context
- * @serial: serial value that is recorded in the audit_context
+ * @stamp: timestamp to record
*
* Also sets the context as auditable.
*/
-int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial)
+int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp)
{
if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
- if (!ctx->serial)
- ctx->serial = audit_serial();
- t->tv_sec = ctx->ctime.tv_sec;
- t->tv_nsec = ctx->ctime.tv_nsec;
- *serial = ctx->serial;
+ if (!ctx->stamp.serial)
+ ctx->stamp.serial = audit_serial();
+ *stamp = ctx->stamp;
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_STATE_RECORD;
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 17067dcb4386..eb3de35734f0 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -3,7 +3,7 @@
# BPF interpreter that, for example, classic socket filters depend on.
config BPF
bool
- select CRYPTO_LIB_SHA1
+ select CRYPTO_LIB_SHA256
# Used by archs to tell that they support BPF JIT compiler plus which
# flavour. Only one of the two can be selected for a specific arch since
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 269c04a24664..7fd0badfacb1 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
@@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 5b37753799d2..1074ac4459f2 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -633,3 +633,33 @@ static int __init kfunc_init(void)
return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+ u64 user_vm_start;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it will not
+ * disappear while we are handling the fault.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(fault_ip);
+ rcu_read_unlock();
+ if (!prog)
+ return;
+
+ /* Use main prog for stream access */
+ prog = prog->aux->main_prog_aux->prog;
+
+ user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
+ addr += clear_lo32(user_vm_start);
+
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n",
+ write ? "WRITE" : "READ", addr);
+ bpf_stream_dump_stack(ss);
+ }));
+}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3d080916faf9..80b1765a3159 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -12,6 +12,7 @@
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
+#include <crypto/sha2.h>
#include "map_in_map.h"
@@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + (u64)array->elem_size * (index & array->index_mask);
}
+static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
+ void *hash_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ sha256(array->value, (u64)array->elem_size * array->map.max_entries,
+ hash_buf);
+ memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
+ return 0;
+}
+
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
u32 off)
{
@@ -431,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
-static void array_map_free_timers_wq(struct bpf_map *map)
+static void array_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -439,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map)
/* We don't reset or free fields other than timer and workqueue
* on uref dropping to zero.
*/
- if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
for (i = 0; i < array->map.max_entries; i++) {
if (btf_record_has_field(map->record, BPF_TIMER))
bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
if (btf_record_has_field(map->record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
+ if (btf_record_has_field(map->record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
}
}
}
@@ -783,7 +797,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_release_uref = array_map_free_timers_wq,
+ .map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -800,6 +814,7 @@ const struct bpf_map_ops array_map_ops = {
.map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
+ .map_get_hash = &array_map_get_hash,
};
const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 148da8f7ff36..0687a760974a 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -45,8 +45,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
if (!local_storage)
goto out;
@@ -55,8 +54,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
out:
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
}
static struct bpf_local_storage_data *
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 15a3eb9b02d9..e54cce2b9175 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -62,8 +62,7 @@ void bpf_inode_storage_free(struct inode *inode)
if (!bsb)
return;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(bsb->storage);
if (!local_storage)
@@ -71,8 +70,7 @@ void bpf_inode_storage_free(struct inode *inode)
bpf_local_storage_destroy(local_storage);
out:
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 0cbcae727079..6ac35430c573 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -705,13 +705,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
migrate_enable();
rcu_read_unlock_trace();
} else {
- rcu_read_lock();
- migrate_disable();
+ rcu_read_lock_dont_migrate();
old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
bpf_reset_run_ctx(old_run_ctx);
- migrate_enable();
- rcu_read_unlock();
+ rcu_read_unlock_migrate();
}
/* bpf program can only return 0 or 1:
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 2d6e1c98d8ad..e7a2fc60523f 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -19,14 +19,6 @@
#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
-static int get_next_cpu(int cpu)
-{
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_possible_mask);
- return cpu;
-}
-
/* Local list helpers */
static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
{
@@ -482,7 +474,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
- steal = get_next_cpu(steal);
+ steal = cpumask_next_wrap(steal, cpu_possible_mask);
} while (!node && steal != first_steal);
loc_l->next_steal = steal;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 687a3e9c76f5..a41e6730edcf 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1174,6 +1174,18 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+u32 bpf_struct_ops_id(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ return st_map->map.id;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
+
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 1109475953c0..a1dc1bf0848a 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -70,8 +70,7 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage)
@@ -81,8 +80,7 @@ void bpf_task_storage_free(struct task_struct *task)
bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
out:
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 64739308902f..0de8fc8a0e0b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3478,60 +3478,45 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
return BTF_FIELD_FOUND;
}
-#define field_mask_test_name(field_type, field_type_str) \
- if (field_mask & field_type && !strcmp(name, field_type_str)) { \
- type = field_type; \
- goto end; \
- }
-
static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
- u32 field_mask, u32 *seen_mask,
- int *align, int *sz)
-{
- int type = 0;
+ u32 field_mask, u32 *seen_mask, int *align, int *sz)
+{
+ const struct {
+ enum btf_field_type type;
+ const char *const name;
+ const bool is_unique;
+ } field_types[] = {
+ { BPF_SPIN_LOCK, "bpf_spin_lock", true },
+ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true },
+ { BPF_TIMER, "bpf_timer", true },
+ { BPF_WORKQUEUE, "bpf_wq", true },
+ { BPF_TASK_WORK, "bpf_task_work", true },
+ { BPF_LIST_HEAD, "bpf_list_head", false },
+ { BPF_LIST_NODE, "bpf_list_node", false },
+ { BPF_RB_ROOT, "bpf_rb_root", false },
+ { BPF_RB_NODE, "bpf_rb_node", false },
+ { BPF_REFCOUNT, "bpf_refcount", false },
+ };
+ int type = 0, i;
const char *name = __btf_name_by_offset(btf, var_type->name_off);
-
- if (field_mask & BPF_SPIN_LOCK) {
- if (!strcmp(name, "bpf_spin_lock")) {
- if (*seen_mask & BPF_SPIN_LOCK)
- return -E2BIG;
- *seen_mask |= BPF_SPIN_LOCK;
- type = BPF_SPIN_LOCK;
- goto end;
- }
- }
- if (field_mask & BPF_RES_SPIN_LOCK) {
- if (!strcmp(name, "bpf_res_spin_lock")) {
- if (*seen_mask & BPF_RES_SPIN_LOCK)
- return -E2BIG;
- *seen_mask |= BPF_RES_SPIN_LOCK;
- type = BPF_RES_SPIN_LOCK;
- goto end;
- }
- }
- if (field_mask & BPF_TIMER) {
- if (!strcmp(name, "bpf_timer")) {
- if (*seen_mask & BPF_TIMER)
- return -E2BIG;
- *seen_mask |= BPF_TIMER;
- type = BPF_TIMER;
- goto end;
- }
- }
- if (field_mask & BPF_WORKQUEUE) {
- if (!strcmp(name, "bpf_wq")) {
- if (*seen_mask & BPF_WORKQUEUE)
+ const char *field_type_name;
+ enum btf_field_type field_type;
+ bool is_unique;
+
+ for (i = 0; i < ARRAY_SIZE(field_types); ++i) {
+ field_type = field_types[i].type;
+ field_type_name = field_types[i].name;
+ is_unique = field_types[i].is_unique;
+ if (!(field_mask & field_type) || strcmp(name, field_type_name))
+ continue;
+ if (is_unique) {
+ if (*seen_mask & field_type)
return -E2BIG;
- *seen_mask |= BPF_WORKQUEUE;
- type = BPF_WORKQUEUE;
- goto end;
+ *seen_mask |= field_type;
}
+ type = field_type;
+ goto end;
}
- field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
- field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
- field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
- field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
- field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
@@ -3545,8 +3530,6 @@ end:
return type;
}
-#undef field_mask_test_name
-
/* Repeat a number of fields for a specified number of times.
*
* Copy the fields starting from the first field and repeat them for
@@ -3693,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf,
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
+ case BPF_TASK_WORK:
ret = btf_find_struct(btf, var_type, off, sz, field_type,
info_cnt ? &info[0] : &tmp);
if (ret < 0)
@@ -3985,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->timer_off = -EINVAL;
rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
+ rec->task_work_off = -EINVAL;
for (i = 0; i < cnt; i++) {
field_type_size = btf_field_type_size(info_arr[i].type);
if (info_arr[i].off + field_type_size > value_size) {
@@ -4024,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->wq_off = rec->fields[i].offset;
break;
+ case BPF_TASK_WORK:
+ WARN_ON_ONCE(rec->task_work_off >= 0);
+ rec->task_work_off = rec->fields[i].offset;
+ break;
case BPF_REFCOUNT:
WARN_ON_ONCE(rec->refcount_off >= 0);
/* Cache offset for faster lookup at runtime */
@@ -6762,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -7334,7 +7323,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
return t->size;
return -EINVAL;
}
@@ -7343,7 +7332,7 @@ static u8 __get_type_fmodel_flags(const struct btf_type *t)
{
u8 flags = 0;
- if (__btf_type_is_struct(t))
+ if (btf_type_is_struct(t))
flags |= BTF_FMODEL_STRUCT_ARG;
if (btf_type_is_signed_int(t))
flags |= BTF_FMODEL_SIGNED_ARG;
@@ -7384,7 +7373,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
- if (ret < 0 || __btf_type_is_struct(t)) {
+ if (ret < 0 || btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
tname, btf_type_str(t));
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 180b630279b9..248f517d66d0 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,14 +27,15 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
/*
* cgroup bpf destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_bpf_destroy_wq;
static int __init cgroup_bpf_wq_init(void)
{
- cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
+ WQ_PERCPU, 1);
if (!cgroup_bpf_destroy_wq)
panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
return 0;
@@ -71,8 +72,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
u32 func_ret;
run_ctx.retval = retval;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
array = rcu_dereference(cgrp->effective[atype]);
item = &array->items[0];
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
@@ -88,8 +88,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
item++;
}
bpf_reset_run_ctx(old_run_ctx);
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
return run_ctx.retval;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5d1650af899d..d595fe512498 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -18,6 +18,7 @@
*/
#include <uapi/linux/btf.h>
+#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
@@ -38,6 +39,7 @@
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>
+#include <crypto/sha2.h>
#include <asm/barrier.h>
#include <linux/unaligned.h>
@@ -119,6 +121,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
+ fp->aux->main_prog_aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
fp->blinding_requested = bpf_jit_blinding_enabled(fp);
@@ -293,28 +296,18 @@ void __bpf_prog_free(struct bpf_prog *fp)
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
- const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
- u32 raw_size = bpf_prog_tag_scratch_size(fp);
- u32 digest[SHA1_DIGEST_WORDS];
- u32 ws[SHA1_WORKSPACE_WORDS];
- u32 i, bsize, psize, blocks;
+ size_t size = bpf_prog_insn_size(fp);
struct bpf_insn *dst;
bool was_ld_map;
- u8 *raw, *todo;
- __be32 *result;
- __be64 *bits;
+ u32 i;
- raw = vmalloc(raw_size);
- if (!raw)
+ dst = vmalloc(size);
+ if (!dst)
return -ENOMEM;
- sha1_init_raw(digest);
- memset(ws, 0, sizeof(ws));
-
/* We need to take out the map fd for the digest calculation
* since they are unstable from user space side.
*/
- dst = (void *)raw;
for (i = 0, was_ld_map = false; i < fp->len; i++) {
dst[i] = fp->insnsi[i];
if (!was_ld_map &&
@@ -334,33 +327,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
was_ld_map = false;
}
}
-
- psize = bpf_prog_insn_size(fp);
- memset(&raw[psize], 0, raw_size - psize);
- raw[psize++] = 0x80;
-
- bsize = round_up(psize, SHA1_BLOCK_SIZE);
- blocks = bsize / SHA1_BLOCK_SIZE;
- todo = raw;
- if (bsize - psize >= sizeof(__be64)) {
- bits = (__be64 *)(todo + bsize - sizeof(__be64));
- } else {
- bits = (__be64 *)(todo + bsize + bits_offset);
- blocks++;
- }
- *bits = cpu_to_be64((psize - 1) << 3);
-
- while (blocks--) {
- sha1_transform(digest, todo, ws);
- todo += SHA1_BLOCK_SIZE;
- }
-
- result = (__force __be32 *)digest;
- for (i = 0; i < SHA1_DIGEST_WORDS; i++)
- result[i] = cpu_to_be32(digest[i]);
- memcpy(fp->tag, result, sizeof(fp->tag));
-
- vfree(raw);
+ sha256((u8 *)dst, size, fp->digest);
+ vfree(dst);
return 0;
}
@@ -2366,8 +2334,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
{
/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
- * is not working properly, or interpreter is being used when
- * prog->jit_requested is not 0, so warn about it!
+ * is not working properly, so warn about it!
*/
WARN_ON_ONCE(1);
return 0;
@@ -2394,6 +2361,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
map->owner->type = prog_type;
map->owner->jited = fp->jited;
map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->expected_attach_type = fp->expected_attach_type;
map->owner->attach_func_proto = aux->attach_func_proto;
for_each_cgroup_storage_type(i) {
map->owner->storage_cookie[i] =
@@ -2405,6 +2373,10 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
ret = map->owner->type == prog_type &&
map->owner->jited == fp->jited &&
map->owner->xdp_has_frags == aux->xdp_has_frags;
+ if (ret &&
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+ map->owner->expected_attach_type != fp->expected_attach_type)
+ ret = false;
for_each_cgroup_storage_type(i) {
if (!ret)
break;
@@ -2468,8 +2440,9 @@ out:
return ret;
}
-static void bpf_prog_select_func(struct bpf_prog *fp)
+static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
+ bool select_interpreter = false;
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
u32 idx = (round_up(stack_depth, 32) / 32) - 1;
@@ -2478,15 +2451,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
* But for non-JITed programs, we don't need bpf_func, so no bounds
* check needed.
*/
- if (!fp->jit_requested &&
- !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) {
+ if (idx < ARRAY_SIZE(interpreters)) {
fp->bpf_func = interpreters[idx];
+ select_interpreter = true;
} else {
fp->bpf_func = __bpf_prog_ret0_warn;
}
#else
fp->bpf_func = __bpf_prog_ret0_warn;
#endif
+ return select_interpreter;
}
/**
@@ -2505,7 +2479,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
- bool jit_needed = fp->jit_requested;
+ bool jit_needed = false;
if (fp->bpf_func)
goto finalize;
@@ -2514,7 +2488,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
bpf_prog_has_kfunc_call(fp))
jit_needed = true;
- bpf_prog_select_func(fp);
+ if (!bpf_prog_select_interpreter(fp))
+ jit_needed = true;
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -3024,7 +2999,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output);
/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
- .func = NULL,
+ /* func is unused for tail_call, we set it to pass the
+ * get_helper_proto check
+ */
+ .func = BPF_PTR_POISON,
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_CTX,
@@ -3324,9 +3302,8 @@ static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
rcu_read_unlock();
if (!prog)
return true;
- if (bpf_is_subprog(prog))
- return true;
- ctxp->prog = prog;
+ /* Make sure we return the main prog if we found a subprog */
+ ctxp->prog = prog->aux->main_prog_aux->prog;
return false;
}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b2b7b8ec2c2a..703e5df1f4ef 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff xdp;
int i, nframes = 0;
- xdp_set_return_frame_no_direct();
xdp.rxq = &rxq;
for (i = 0; i < n; i++) {
@@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
}
- xdp_clear_return_frame_no_direct();
stats->pass += nframes;
return nframes;
@@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
rcu_read_lock();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ xdp_set_return_frame_no_direct();
ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
if (unlikely(ret->skb_n))
@@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (stats->redirect)
xdp_do_flush();
+ xdp_clear_return_frame_no_direct();
bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock();
@@ -550,7 +550,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
- queue_rcu_work(system_wq, &old_rcpu->free_work);
+ queue_rcu_work(system_percpu_wq, &old_rcpu->free_work);
}
}
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 94854cd9c4cc..83c4d9943084 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
siv_len = siv ? __bpf_dynptr_size(siv) : 0;
src_len = __bpf_dynptr_size(src);
dst_len = __bpf_dynptr_size(dst);
- if (!src_len || !dst_len)
+ if (!src_len || !dst_len || src_len > dst_len)
return -EINVAL;
if (siv_len != ctx->siv_len)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 482d284a1553..2625601de76e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -865,7 +865,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab_netdev *dev;
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
- GFP_NOWAIT | __GFP_NOWARN,
+ GFP_NOWAIT,
dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 71f9931ac64c..c2fcd0cd51e5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -215,7 +215,20 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
-static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ if (btf_record_has_field(htab->map.record, BPF_TIMER))
+ bpf_obj_free_timer(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+ if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+ if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+}
+
+static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
@@ -227,12 +240,7 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- if (btf_record_has_field(htab->map.record, BPF_TIMER))
- bpf_obj_free_timer(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
- if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
- bpf_obj_free_workqueue(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
+ htab_free_internal_structs(htab, elem);
cond_resched();
}
}
@@ -1490,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
-static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
{
int i;
@@ -1502,28 +1510,23 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
hlist_nulls_for_each_entry(l, n, head, hash_node) {
/* We only free timer on uref dropping to zero */
- if (btf_record_has_field(htab->map.record, BPF_TIMER))
- bpf_obj_free_timer(htab->map.record,
- htab_elem_value(l, htab->map.key_size));
- if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
- bpf_obj_free_workqueue(htab->map.record,
- htab_elem_value(l, htab->map.key_size));
+ htab_free_internal_structs(htab, l);
}
cond_resched_rcu();
}
rcu_read_unlock();
}
-static void htab_map_free_timers_and_wq(struct bpf_map *map)
+static void htab_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
/* We only free timer and workqueue on uref dropping to zero */
- if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
if (!htab_is_prealloc(htab))
- htab_free_malloced_timers_and_wq(htab);
+ htab_free_malloced_internal_structs(htab);
else
- htab_free_prealloced_timers_and_wq(htab);
+ htab_free_prealloced_internal_structs(htab);
}
}
@@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers_and_wq,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers_and_wq,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68..c9fab9a356df 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,6 +25,9 @@
#include <linux/kasan.h>
#include <linux/bpf_verifier.h>
#include <linux/uaccess.h>
+#include <linux/verification.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
#include "../../lib/kstrtox.h"
@@ -774,11 +777,9 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
int nest_level;
- preempt_disable();
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
return -EBUSY;
}
*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
@@ -791,7 +792,6 @@ void bpf_put_buffers(void)
if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
return;
this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
}
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
@@ -1084,6 +1084,17 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
+{
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ *arr_idx = ((char *)value - array->value) / array->elem_size;
+ return arr_idx;
+ }
+ return (void *)value - round_up(map->key_size, 8);
+}
+
struct bpf_async_cb {
struct bpf_map *map;
struct bpf_prog *prog;
@@ -1166,15 +1177,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
* bpf_map_delete_elem() on the same timer.
*/
this_cpu_write(hrtimer_running, t);
- if (map->map_type == BPF_MAP_TYPE_ARRAY) {
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- /* compute the key */
- idx = ((char *)value - array->value) / array->elem_size;
- key = &idx;
- } else { /* hash or lru */
- key = value - round_up(map->key_size, 8);
- }
+ key = map_key_from_value(map, value, &idx);
callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
@@ -1200,15 +1204,7 @@ static void bpf_wq_work(struct work_struct *work)
if (!callback_fn)
return;
- if (map->map_type == BPF_MAP_TYPE_ARRAY) {
- struct bpf_array *array = container_of(map, struct bpf_array, map);
-
- /* compute the key */
- idx = ((char *)value - array->value) / array->elem_size;
- key = &idx;
- } else { /* hash or lru */
- key = value - round_up(map->key_size, 8);
- }
+ key = map_key_from_value(map, value, &idx);
rcu_read_lock_trace();
migrate_disable();
@@ -1274,8 +1270,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* allocate hrtimer via map_kmalloc to use memcg accounting */
- cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
+ * kmalloc_nolock() is available, avoid locking issues by using
+ * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
+ */
+ cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
@@ -1597,7 +1596,7 @@ void bpf_timer_cancel_and_free(void *val)
* timer callback.
*/
if (this_cpu_read(hrtimer_running)) {
- queue_work(system_unbound_wq, &t->cb.delete_work);
+ queue_work(system_dfl_wq, &t->cb.delete_work);
return;
}
@@ -1610,7 +1609,7 @@ void bpf_timer_cancel_and_free(void *val)
if (hrtimer_try_to_cancel(&t->timer) >= 0)
kfree_rcu(t, cb.rcu);
else
- queue_work(system_unbound_wq, &t->cb.delete_work);
+ queue_work(system_dfl_wq, &t->cb.delete_work);
} else {
bpf_timer_delete_work(&t->cb.delete_work);
}
@@ -1780,6 +1779,9 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s
return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
case BPF_DYNPTR_TYPE_XDP:
return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
+ return 0;
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1836,6 +1838,11 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
if (flags)
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ if (flags)
+ return -EINVAL;
+ memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len);
+ return 0;
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1882,6 +1889,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
return (unsigned long)(ptr->data + ptr->offset + offset);
case BPF_DYNPTR_TYPE_SKB:
case BPF_DYNPTR_TYPE_XDP:
+ case BPF_DYNPTR_TYPE_SKB_META:
/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
return 0;
default:
@@ -2537,7 +2545,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
{
struct cgroup *cgrp;
- cgrp = cgroup_get_from_id(cgid);
+ cgrp = __cgroup_get_from_id(cgid);
if (IS_ERR(cgrp))
return NULL;
return cgrp;
@@ -2710,6 +2718,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
return buffer__opt;
}
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -3341,39 +3351,30 @@ __bpf_kfunc void __bpf_trap(void)
* __get_kernel_nofault instead of plain dereference to make them safe.
*/
-/**
- * bpf_strcmp - Compare two strings
- * @s1__ign: One string
- * @s2__ign: Another string
- *
- * Return:
- * * %0 - Strings are equal
- * * %-1 - @s1__ign is smaller
- * * %1 - @s2__ign is smaller
- * * %-EFAULT - Cannot read one of the strings
- * * %-E2BIG - One of strings is too large
- * * %-ERANGE - One of strings is outside of kernel address space
- */
-__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
{
char c1, c2;
int i;
- if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
- !copy_from_kernel_nofault_allowed(s2__ign, 1)) {
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
return -ERANGE;
}
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
- __get_kernel_nofault(&c1, s1__ign, char, err_out);
- __get_kernel_nofault(&c2, s2__ign, char, err_out);
+ __get_kernel_nofault(&c1, s1, char, err_out);
+ __get_kernel_nofault(&c2, s2, char, err_out);
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
if (c1 != c2)
return c1 < c2 ? -1 : 1;
if (c1 == '\0')
return 0;
- s1__ign++;
- s2__ign++;
+ s1++;
+ s2++;
}
return -E2BIG;
err_out:
@@ -3381,6 +3382,42 @@ err_out:
}
/**
+ * bpf_strcmp - Compare two strings
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, false);
+}
+
+/**
+ * bpf_strcasecmp - Compare two strings, ignoring the case of the characters
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, true);
+}
+
+/**
* bpf_strnchr - Find a character in a length limited string
* @s__ign: The string to be searched
* @count: The number of characters to be searched
@@ -3664,10 +3701,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
- for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) {
+ for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
__get_kernel_nofault(&c2, s2__ign + j, char, err_out);
if (c2 == '\0')
return i;
+ /*
+ * We allow reading an extra byte from s2 (note the
+ * `i + j <= len` above) to cover the case when s2 is
+ * a suffix of the first len chars of s1.
+ */
+ if (i + j == len)
+ break;
__get_kernel_nofault(&c1, s1__ign + j, char, err_out);
if (c1 == '\0')
return -ENOENT;
@@ -3702,9 +3746,490 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
{
return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
}
+#ifdef CONFIG_KEYS
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
+ return NULL;
+ }
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_p: data to verify
+ * @sig_p: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
+ struct bpf_key *trusted_keyring)
+{
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
+ const void *data, *sig;
+ u32 data_len, sig_len;
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
+ trusted_keyring->key,
+ VERIFYING_BPF_SIGNATURE, NULL,
+ NULL);
+#else
+ return -EOPNOTSUPP;
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+}
+#endif /* CONFIG_KEYS */
+
+typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
+
+enum bpf_task_work_state {
+ /* bpf_task_work is ready to be used */
+ BPF_TW_STANDBY = 0,
+ /* irq work scheduling in progress */
+ BPF_TW_PENDING,
+ /* task work scheduling in progress */
+ BPF_TW_SCHEDULING,
+ /* task work is scheduled successfully */
+ BPF_TW_SCHEDULED,
+ /* callback is running */
+ BPF_TW_RUNNING,
+ /* associated BPF map value is deleted */
+ BPF_TW_FREED,
+};
+
+struct bpf_task_work_ctx {
+ enum bpf_task_work_state state;
+ refcount_t refcnt;
+ struct callback_head work;
+ struct irq_work irq_work;
+ /* bpf_prog that schedules task work */
+ struct bpf_prog *prog;
+ /* task for which callback is scheduled */
+ struct task_struct *task;
+ /* the map and map value associated with this context */
+ struct bpf_map *map;
+ void *map_val;
+ enum task_work_notify_mode mode;
+ bpf_task_work_callback_t callback_fn;
+ struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+ struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+ if (ctx->prog) {
+ bpf_prog_put(ctx->prog);
+ ctx->prog = NULL;
+ }
+ if (ctx->task) {
+ bpf_task_release(ctx->task);
+ ctx->task = NULL;
+ }
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+ return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+ if (!refcount_dec_and_test(&ctx->refcnt))
+ return;
+
+ bpf_task_work_ctx_reset(ctx);
+
+ /* bpf_mem_free expects migration to be disabled */
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, ctx);
+ migrate_enable();
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+ /*
+ * Scheduled task_work callback holds ctx ref, so if we successfully
+ * cancelled, we put that ref on callback's behalf. If we couldn't
+ * cancel, callback will inevitably run or has already completed
+ * running, and it would have taken care of its ctx ref itself.
+ */
+ if (task_work_cancel(ctx->task, &ctx->work))
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+ struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+ enum bpf_task_work_state state;
+ u32 idx;
+ void *key;
+
+ /* Read lock is needed to protect ctx and map key/value access */
+ guard(rcu_tasks_trace)();
+ /*
+ * This callback may start running before bpf_task_work_irq() switched to
+ * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+ if (state == BPF_TW_SCHEDULED)
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+ if (state == BPF_TW_FREED) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+ migrate_disable();
+ ctx->callback_fn(ctx->map, key, ctx->map_val);
+ migrate_enable();
+
+ bpf_task_work_ctx_reset(ctx);
+ (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+ enum bpf_task_work_state state;
+ int err;
+
+ guard(rcu_tasks_trace)();
+
+ if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+ if (err) {
+ bpf_task_work_ctx_reset(ctx);
+ /*
+ * try to switch back to STANDBY for another task_work reuse, but we might have
+ * gone to FREED already, which is fine as we already cleaned up after ourselves
+ */
+ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ /*
+ * It's technically possible for just scheduled task_work callback to
+ * complete running by now, going SCHEDULING -> RUNNING and then
+ * dropping its ctx refcount. Instead of capturing extra ref just to
+ * protected below ctx->state access, we rely on RCU protection to
+ * perform below SCHEDULING -> SCHEDULED attempt.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+ if (state == BPF_TW_FREED)
+ bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_kern *twk = (void *)tw;
+ struct bpf_task_work_ctx *ctx, *old_ctx;
+
+ ctx = READ_ONCE(twk->ctx);
+ if (ctx)
+ return ctx;
+
+ ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ memset(ctx, 0, sizeof(*ctx));
+ refcount_set(&ctx->refcnt, 1); /* map's own ref */
+ ctx->state = BPF_TW_STANDBY;
+
+ old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+ if (old_ctx) {
+ /*
+ * tw->ctx is set by concurrent BPF program, release allocated
+ * memory and try to reuse already set context.
+ */
+ bpf_mem_free(&bpf_global_ma, ctx);
+ return old_ctx;
+ }
+
+ return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_ctx *ctx;
+
+ ctx = bpf_task_work_fetch_ctx(tw, map);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ /* try to get ref for task_work callback to hold */
+ if (!bpf_task_work_ctx_tryget(ctx))
+ return ERR_PTR(-EBUSY);
+
+ if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+ /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
+ bpf_task_work_ctx_put(ctx);
+ return ERR_PTR(-EBUSY);
+ }
+
+ /*
+ * If no process or bpffs is holding a reference to the map, no new callbacks should be
+ * scheduled. This does not address any race or correctness issue, but rather is a policy
+ * choice: dropping user references should stop everything.
+ */
+ if (!atomic64_read(&map->usercnt)) {
+ /* drop ref we just got for task_work callback itself */
+ bpf_task_work_ctx_put(ctx);
+ /* transfer map's ref into cancel_and_free() */
+ bpf_task_work_cancel_and_free(tw);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return ctx;
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+ struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+ struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+ struct bpf_prog *prog;
+ struct bpf_task_work_ctx *ctx;
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_task_work);
+
+ prog = bpf_prog_inc_not_zero(aux->prog);
+ if (IS_ERR(prog))
+ return -EBADF;
+ task = bpf_task_acquire(task);
+ if (!task) {
+ err = -EBADF;
+ goto release_prog;
+ }
+
+ ctx = bpf_task_work_acquire_ctx(tw, map);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto release_all;
+ }
+
+ ctx->task = task;
+ ctx->callback_fn = callback_fn;
+ ctx->prog = prog;
+ ctx->mode = mode;
+ ctx->map = map;
+ ctx->map_val = (void *)tw - map->record->task_work_off;
+ init_task_work(&ctx->work, bpf_task_work_callback);
+ init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+ irq_work_queue(&ctx->irq_work);
+ return 0;
+
+release_all:
+ bpf_task_release(task);
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+/**
+ * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
+}
+
+/**
+ * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
+}
__bpf_kfunc_end_defs();
+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+ bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
+ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
+}
+
+void bpf_task_work_cancel_and_free(void *val)
+{
+ struct bpf_task_work_kern *twk = val;
+ struct bpf_task_work_ctx *ctx;
+ enum bpf_task_work_state state;
+
+ ctx = xchg(&twk->ctx, NULL);
+ if (!ctx)
+ return;
+
+ state = xchg(&ctx->state, BPF_TW_FREED);
+ if (state == BPF_TW_SCHEDULED) {
+ /* run in irq_work to avoid locks in NMI */
+ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+ irq_work_queue(&ctx->irq_work);
+ return;
+ }
+
+ bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
+}
+
BTF_KFUNCS_START(generic_btf_ids)
#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
@@ -3743,6 +4268,14 @@ BTF_ID_FLAGS(func, bpf_throw)
#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
#endif
+#ifdef CONFIG_KEYS
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+#endif
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3824,6 +4357,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
#endif
BTF_ID_FLAGS(func, __bpf_trap)
BTF_ID_FLAGS(func, bpf_strcmp);
+BTF_ID_FLAGS(func, bpf_strcasecmp);
BTF_ID_FLAGS(func, bpf_strchr);
BTF_ID_FLAGS(func, bpf_strchrnul);
BTF_ID_FLAGS(func, bpf_strnchr);
@@ -3838,6 +4372,8 @@ BTF_ID_FLAGS(func, bpf_strnstr);
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5c2e96b19392..f90bdcc0a047 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
umode_t mode;
int ret;
- dentry = user_path_create(path_fd, pathname, &path, 0);
+ dentry = start_creating_user_path(path_fd, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
ret = -EPERM;
}
out:
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return ret;
}
@@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode)
const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = bpf_show_options,
.free_inode = bpf_free_inode,
};
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
new file mode 100644
index 000000000000..3c611aba7f52
--- /dev/null
+++ b/kernel/bpf/liveness.c
@@ -0,0 +1,733 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf_verifier.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+
+/*
+ * This file implements live stack slots analysis. After accumulating
+ * stack usage data, the analysis answers queries about whether a
+ * particular stack slot may be read by an instruction or any of it's
+ * successors. This data is consumed by the verifier states caching
+ * mechanism to decide which stack slots are important when looking for a
+ * visited state corresponding to the current state.
+ *
+ * The analysis is call chain sensitive, meaning that data is collected
+ * and queried for tuples (call chain, subprogram instruction index).
+ * Such sensitivity allows identifying if some subprogram call always
+ * leads to writes in the caller's stack.
+ *
+ * The basic idea is as follows:
+ * - As the verifier accumulates a set of visited states, the analysis instance
+ * accumulates a conservative estimate of stack slots that can be read
+ * or must be written for each visited tuple (call chain, instruction index).
+ * - If several states happen to visit the same instruction with the same
+ * call chain, stack usage information for the corresponding tuple is joined:
+ * - "may_read" set represents a union of all possibly read slots
+ * (any slot in "may_read" set might be read at or after the instruction);
+ * - "must_write" set represents an intersection of all possibly written slots
+ * (any slot in "must_write" set is guaranteed to be written by the instruction).
+ * - The analysis is split into two phases:
+ * - read and write marks accumulation;
+ * - read and write marks propagation.
+ * - The propagation phase is a textbook live variable data flow analysis:
+ *
+ * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)]
+ * state[cc, i].live_before =
+ * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
+ *
+ * Where:
+ * - `U` stands for set union
+ * - `/` stands for set difference;
+ * - `cc` stands for a call chain;
+ * - `i` and `s` are instruction indexes;
+ *
+ * The above equations are computed for each call chain and instruction
+ * index until state stops changing.
+ * - Additionally, in order to transfer "must_write" information from a
+ * subprogram to call instructions invoking this subprogram,
+ * the "must_write_acc" set is tracked for each (cc, i) tuple.
+ * A set of stack slots that are guaranteed to be written by this
+ * instruction or any of its successors (within the subprogram).
+ * The equation for "must_write_acc" propagation looks as follows:
+ *
+ * state[cc, i].must_write_acc =
+ * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)]
+ * U state[cc, i].must_write
+ *
+ * (An intersection of all "must_write_acc" for instruction successors
+ * plus all "must_write" slots for the instruction itself).
+ * - After the propagation phase completes for a subprogram, information from
+ * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
+ * - "must_write_acc" set is intersected with the call site's "must_write" set;
+ * - "may_read" set is added to the call site's "may_read" set.
+ * - Any live stack queries must be taken after the propagation phase.
+ * - Accumulation and propagation phases can be entered multiple times,
+ * at any point in time:
+ * - "may_read" set only grows;
+ * - "must_write" set only shrinks;
+ * - for each visited verifier state with zero branches, all relevant
+ * read and write marks are already recorded by the analysis instance.
+ *
+ * Technically, the analysis is facilitated by the following data structures:
+ * - Call chain: for given verifier state, the call chain is a tuple of call
+ * instruction indexes leading to the current subprogram plus the subprogram
+ * entry point index.
+ * - Function instance: for a given call chain, for each instruction in
+ * the current subprogram, a mapping between instruction index and a
+ * set of "may_read", "must_write" and other marks accumulated for this
+ * instruction.
+ * - A hash table mapping call chains to function instances.
+ */
+
+struct callchain {
+ u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */
+ /* cached subprog_info[*].start for functions owning the frames:
+ * - sp_starts[curframe] used to get insn relative index within current function;
+ * - sp_starts[0..current-1] used for fast callchain_frame_up().
+ */
+ u32 sp_starts[MAX_CALL_FRAMES];
+ u32 curframe; /* depth of callsites and sp_starts arrays */
+};
+
+struct per_frame_masks {
+ u64 may_read; /* stack slots that may be read by this instruction */
+ u64 must_write; /* stack slots written by this instruction */
+ u64 must_write_acc; /* stack slots written by this instruction and its successors */
+ u64 live_before; /* stack slots that may be read by this insn and its successors */
+};
+
+/*
+ * A function instance created for a specific callchain.
+ * Encapsulates read and write marks for each instruction in the function.
+ * Marks are tracked for each frame in the callchain.
+ */
+struct func_instance {
+ struct hlist_node hl_node;
+ struct callchain callchain;
+ u32 insn_cnt; /* cached number of insns in the function */
+ bool updated;
+ bool must_write_dropped;
+ /* Per frame, per instruction masks, frames allocated lazily. */
+ struct per_frame_masks *frames[MAX_CALL_FRAMES];
+ /* For each instruction a flag telling if "must_write" had been initialized for it. */
+ bool *must_write_set;
+};
+
+struct live_stack_query {
+ struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */
+ u32 curframe;
+ u32 insn_idx;
+};
+
+struct bpf_liveness {
+ DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */
+ struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */
+ /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */
+ struct func_instance *cur_instance;
+ /*
+ * Below fields are used to accumulate stack write marks for instruction at
+ * @write_insn_idx before submitting the marks to @cur_instance.
+ */
+ u64 write_masks_acc[MAX_CALL_FRAMES];
+ u32 write_insn_idx;
+};
+
+/* Compute callchain corresponding to state @st at depth @frameno */
+static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st,
+ struct callchain *callchain, u32 frameno)
+{
+ struct bpf_subprog_info *subprog_info = env->subprog_info;
+ u32 i;
+
+ memset(callchain, 0, sizeof(*callchain));
+ for (i = 0; i <= frameno; i++) {
+ callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start;
+ if (i < st->curframe)
+ callchain->callsites[i] = st->frame[i + 1]->callsite;
+ }
+ callchain->curframe = frameno;
+ callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe];
+}
+
+static u32 hash_callchain(struct callchain *callchain)
+{
+ return jhash2(callchain->callsites, callchain->curframe, 0);
+}
+
+static bool same_callsites(struct callchain *a, struct callchain *b)
+{
+ int i;
+
+ if (a->curframe != b->curframe)
+ return false;
+ for (i = a->curframe; i >= 0; i--)
+ if (a->callsites[i] != b->callsites[i])
+ return false;
+ return true;
+}
+
+/*
+ * Find existing or allocate new function instance corresponding to @callchain.
+ * Instances are accumulated in env->liveness->func_instances and persist
+ * until the end of the verification process.
+ */
+static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
+ struct callchain *callchain)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ struct bpf_subprog_info *subprog;
+ struct func_instance *result;
+ u32 subprog_sz, size, key;
+
+ key = hash_callchain(callchain);
+ hash_for_each_possible(liveness->func_instances, result, hl_node, key)
+ if (same_callsites(&result->callchain, callchain))
+ return result;
+
+ subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]);
+ subprog_sz = (subprog + 1)->start - subprog->start;
+ size = sizeof(struct func_instance);
+ result = kvzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!result)
+ return ERR_PTR(-ENOMEM);
+ result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set),
+ GFP_KERNEL_ACCOUNT);
+ if (!result->must_write_set)
+ return ERR_PTR(-ENOMEM);
+ memcpy(&result->callchain, callchain, sizeof(*callchain));
+ result->insn_cnt = subprog_sz;
+ hash_add(liveness->func_instances, &result->hl_node, key);
+ return result;
+}
+
+static struct func_instance *lookup_instance(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ u32 frameno)
+{
+ struct callchain callchain;
+
+ compute_callchain(env, st, &callchain, frameno);
+ return __lookup_instance(env, &callchain);
+}
+
+int bpf_stack_liveness_init(struct bpf_verifier_env *env)
+{
+ env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT);
+ if (!env->liveness)
+ return -ENOMEM;
+ hash_init(env->liveness->func_instances);
+ return 0;
+}
+
+void bpf_stack_liveness_free(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance;
+ struct hlist_node *tmp;
+ int bkt, i;
+
+ if (!env->liveness)
+ return;
+ hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) {
+ for (i = 0; i <= instance->callchain.curframe; i++)
+ kvfree(instance->frames[i]);
+ kvfree(instance->must_write_set);
+ kvfree(instance);
+ }
+ kvfree(env->liveness);
+}
+
+/*
+ * Convert absolute instruction index @insn_idx to an index relative
+ * to start of the function corresponding to @instance.
+ */
+static int relative_idx(struct func_instance *instance, u32 insn_idx)
+{
+ return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe];
+}
+
+static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
+ u32 frame, u32 insn_idx)
+{
+ if (!instance->frames[frame])
+ return NULL;
+
+ return &instance->frames[frame][relative_idx(instance, insn_idx)];
+}
+
+static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ u32 frame, u32 insn_idx)
+{
+ struct per_frame_masks *arr;
+
+ if (!instance->frames[frame]) {
+ arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT);
+ instance->frames[frame] = arr;
+ if (!arr)
+ return ERR_PTR(-ENOMEM);
+ }
+ return get_frame_masks(instance, frame, insn_idx);
+}
+
+void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env)
+{
+ env->liveness->cur_instance = NULL;
+}
+
+/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */
+static int ensure_cur_instance(struct bpf_verifier_env *env)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ struct func_instance *instance;
+
+ if (liveness->cur_instance)
+ return 0;
+
+ instance = lookup_instance(env, env->cur_state, env->cur_state->curframe);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+
+ liveness->cur_instance = instance;
+ return 0;
+}
+
+/* Accumulate may_read masks for @frame at @insn_idx */
+static int mark_stack_read(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask)
+{
+ struct per_frame_masks *masks;
+ u64 new_may_read;
+
+ masks = alloc_frame_masks(env, instance, frame, insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ new_may_read = masks->may_read | mask;
+ if (new_may_read != masks->may_read &&
+ ((new_may_read | masks->live_before) != masks->live_before))
+ instance->updated = true;
+ masks->may_read |= mask;
+ return 0;
+}
+
+int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask)
+{
+ int err;
+
+ err = ensure_cur_instance(env);
+ err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask);
+ return err;
+}
+
+static void reset_stack_write_marks(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 insn_idx)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ int i;
+
+ liveness->write_insn_idx = insn_idx;
+ for (i = 0; i <= instance->callchain.curframe; i++)
+ liveness->write_masks_acc[i] = 0;
+}
+
+int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ int err;
+
+ err = ensure_cur_instance(env);
+ if (err)
+ return err;
+
+ reset_stack_write_marks(env, liveness->cur_instance, insn_idx);
+ return 0;
+}
+
+void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask)
+{
+ env->liveness->write_masks_acc[frame] |= mask;
+}
+
+static int commit_stack_write_marks(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ u32 idx, frame, curframe, old_must_write;
+ struct per_frame_masks *masks;
+ u64 mask;
+
+ if (!instance)
+ return 0;
+
+ curframe = instance->callchain.curframe;
+ idx = relative_idx(instance, liveness->write_insn_idx);
+ for (frame = 0; frame <= curframe; frame++) {
+ mask = liveness->write_masks_acc[frame];
+ /* avoid allocating frames for zero masks */
+ if (mask == 0 && !instance->must_write_set[idx])
+ continue;
+ masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ old_must_write = masks->must_write;
+ /*
+ * If instruction at this callchain is seen for a first time, set must_write equal
+ * to @mask. Otherwise take intersection with the previous value.
+ */
+ if (instance->must_write_set[idx])
+ mask &= old_must_write;
+ if (old_must_write != mask) {
+ masks->must_write = mask;
+ instance->updated = true;
+ }
+ if (old_must_write & ~mask)
+ instance->must_write_dropped = true;
+ }
+ instance->must_write_set[idx] = true;
+ liveness->write_insn_idx = 0;
+ return 0;
+}
+
+/*
+ * Merge stack writes marks in @env->liveness->write_masks_acc
+ * with information already in @env->liveness->cur_instance.
+ */
+int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
+{
+ return commit_stack_write_marks(env, env->liveness->cur_instance);
+}
+
+static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
+{
+ char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
+ char *buf = env->tmp_str_buf;
+ int i;
+
+ buf += snprintf(buf, buf_end - buf, "(");
+ for (i = 0; i <= callchain->curframe; i++)
+ buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
+ snprintf(buf, buf_end - buf, ")");
+ return env->tmp_str_buf;
+}
+
+static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
+ char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
+{
+ u64 changed_bits = old ^ new;
+ u64 new_ones = new & changed_bits;
+ u64 new_zeros = ~new & changed_bits;
+
+ if (!changed_bits)
+ return;
+ bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
+ if (new_ones) {
+ bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones);
+ bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
+ }
+ if (new_zeros) {
+ bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros);
+ bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
+ }
+ bpf_log(&env->log, "\n");
+}
+
+int bpf_jmp_offset(struct bpf_insn *insn)
+{
+ u8 code = insn->code;
+
+ if (code == (BPF_JMP32 | BPF_JA))
+ return insn->imm;
+ return insn->off;
+}
+
+__diag_push();
+__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");
+
+inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
+{
+ static const struct opcode_info {
+ bool can_jump;
+ bool can_fallthrough;
+ } opcode_info_tbl[256] = {
+ [0 ... 255] = {.can_jump = false, .can_fallthrough = true},
+ #define _J(code, ...) \
+ [BPF_JMP | code] = __VA_ARGS__, \
+ [BPF_JMP32 | code] = __VA_ARGS__
+
+ _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}),
+ _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}),
+ _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}),
+ #undef _J
+ };
+ struct bpf_insn *insn = &prog->insnsi[idx];
+ const struct opcode_info *opcode_info;
+ int i = 0, insn_sz;
+
+ opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ if (opcode_info->can_fallthrough)
+ succ[i++] = idx + insn_sz;
+
+ if (opcode_info->can_jump)
+ succ[i++] = idx + bpf_jmp_offset(insn) + 1;
+
+ return i;
+}
+
+__diag_pop();
+
+static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct callchain callchain = instance->callchain;
+
+ /* Adjust @callchain to represent callchain one frame up */
+ callchain.callsites[callchain.curframe] = 0;
+ callchain.sp_starts[callchain.curframe] = 0;
+ callchain.curframe--;
+ callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
+ return __lookup_instance(env, &callchain);
+}
+
+static u32 callchain_subprog_start(struct callchain *callchain)
+{
+ return callchain->sp_starts[callchain->curframe];
+}
+
+/*
+ * Transfer @may_read and @must_write_acc marks from the first instruction of @instance,
+ * to the call instruction in function instance calling @instance.
+ */
+static int propagate_to_outer_instance(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct callchain *callchain = &instance->callchain;
+ u32 this_subprog_start, callsite, frame;
+ struct func_instance *outer_instance;
+ struct per_frame_masks *insn;
+ int err;
+
+ this_subprog_start = callchain_subprog_start(callchain);
+ outer_instance = get_outer_instance(env, instance);
+ callsite = callchain->callsites[callchain->curframe - 1];
+
+ reset_stack_write_marks(env, outer_instance, callsite);
+ for (frame = 0; frame < callchain->curframe; frame++) {
+ insn = get_frame_masks(instance, frame, this_subprog_start);
+ if (!insn)
+ continue;
+ bpf_mark_stack_write(env, frame, insn->must_write_acc);
+ err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before);
+ if (err)
+ return err;
+ }
+ commit_stack_write_marks(env, outer_instance);
+ return 0;
+}
+
+static inline bool update_insn(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 frame, u32 insn_idx)
+{
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ u64 new_before, new_after, must_write_acc;
+ struct per_frame_masks *insn, *succ_insn;
+ u32 succ_num, s, succ[2];
+ bool changed;
+
+ succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
+ if (unlikely(succ_num == 0))
+ return false;
+
+ changed = false;
+ insn = get_frame_masks(instance, frame, insn_idx);
+ new_before = 0;
+ new_after = 0;
+ /*
+ * New "must_write_acc" is an intersection of all "must_write_acc"
+ * of successors plus all "must_write" slots of instruction itself.
+ */
+ must_write_acc = U64_MAX;
+ for (s = 0; s < succ_num; ++s) {
+ succ_insn = get_frame_masks(instance, frame, succ[s]);
+ new_after |= succ_insn->live_before;
+ must_write_acc &= succ_insn->must_write_acc;
+ }
+ must_write_acc |= insn->must_write;
+ /*
+ * New "live_before" is a union of all "live_before" of successors
+ * minus slots written by instruction plus slots read by instruction.
+ */
+ new_before = (new_after & ~insn->must_write) | insn->may_read;
+ changed |= new_before != insn->live_before;
+ changed |= must_write_acc != insn->must_write_acc;
+ if (unlikely(env->log.level & BPF_LOG_LEVEL2) &&
+ (insn->may_read || insn->must_write ||
+ insn_idx == callchain_subprog_start(&instance->callchain) ||
+ aux[insn_idx].prune_point)) {
+ log_mask_change(env, &instance->callchain, "live",
+ frame, insn_idx, insn->live_before, new_before);
+ log_mask_change(env, &instance->callchain, "written",
+ frame, insn_idx, insn->must_write_acc, must_write_acc);
+ }
+ insn->live_before = new_before;
+ insn->must_write_acc = must_write_acc;
+ return changed;
+}
+
+/* Fixed-point computation of @live_before and @must_write_acc marks */
+static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+ u32 i, frame, po_start, po_end, cnt, this_subprog_start;
+ struct callchain *callchain = &instance->callchain;
+ int *insn_postorder = env->cfg.insn_postorder;
+ struct bpf_subprog_info *subprog;
+ struct per_frame_masks *insn;
+ bool changed;
+ int err;
+
+ this_subprog_start = callchain_subprog_start(callchain);
+ /*
+ * If must_write marks were updated must_write_acc needs to be reset
+ * (to account for the case when new must_write sets became smaller).
+ */
+ if (instance->must_write_dropped) {
+ for (frame = 0; frame <= callchain->curframe; frame++) {
+ if (!instance->frames[frame])
+ continue;
+
+ for (i = 0; i < instance->insn_cnt; i++) {
+ insn = get_frame_masks(instance, frame, this_subprog_start + i);
+ insn->must_write_acc = 0;
+ }
+ }
+ }
+
+ subprog = bpf_find_containing_subprog(env, this_subprog_start);
+ po_start = subprog->postorder_start;
+ po_end = (subprog + 1)->postorder_start;
+ cnt = 0;
+ /* repeat until fixed point is reached */
+ do {
+ cnt++;
+ changed = false;
+ for (frame = 0; frame <= instance->callchain.curframe; frame++) {
+ if (!instance->frames[frame])
+ continue;
+
+ for (i = po_start; i < po_end; i++)
+ changed |= update_insn(env, instance, frame, insn_postorder[i]);
+ }
+ } while (changed);
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ bpf_log(&env->log, "%s live stack update done in %d iterations\n",
+ fmt_callchain(env, callchain), cnt);
+
+ /* transfer marks accumulated for outer frames to outer func instance (caller) */
+ if (callchain->curframe > 0) {
+ err = propagate_to_outer_instance(env, instance);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * Prepare all callchains within @env->cur_state for querying.
+ * This function should be called after each verifier.c:pop_stack()
+ * and whenever verifier.c:do_check_insn() processes subprogram exit.
+ * This would guarantee that visited verifier states with zero branches
+ * have their bpf_mark_stack_{read,write}() effects propagated in
+ * @env->liveness.
+ */
+int bpf_update_live_stack(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance;
+ int err, frame;
+
+ bpf_reset_live_stack_callchain(env);
+ for (frame = env->cur_state->curframe; frame >= 0; --frame) {
+ instance = lookup_instance(env, env->cur_state, frame);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+
+ if (instance->updated) {
+ err = update_instance(env, instance);
+ if (err)
+ return err;
+ instance->updated = false;
+ instance->must_write_dropped = false;
+ }
+ }
+ return 0;
+}
+
+static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi)
+{
+ struct per_frame_masks *masks;
+
+ masks = get_frame_masks(instance, frameno, insn_idx);
+ return masks && (masks->live_before & BIT(spi));
+}
+
+int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct live_stack_query *q = &env->liveness->live_stack_query;
+ struct func_instance *instance;
+ u32 frame;
+
+ memset(q, 0, sizeof(*q));
+ for (frame = 0; frame <= st->curframe; frame++) {
+ instance = lookup_instance(env, st, frame);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+ q->instances[frame] = instance;
+ }
+ q->curframe = st->curframe;
+ q->insn_idx = st->insn_idx;
+ return 0;
+}
+
+bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi)
+{
+ /*
+ * Slot is alive if it is read before q->st->insn_idx in current func instance,
+ * or if for some outer func instance:
+ * - alive before callsite if callsite calls callback, otherwise
+ * - alive after callsite
+ */
+ struct live_stack_query *q = &env->liveness->live_stack_query;
+ struct func_instance *instance, *curframe_instance;
+ u32 i, callsite;
+ bool alive;
+
+ curframe_instance = q->instances[q->curframe];
+ if (is_live_before(curframe_instance, q->insn_idx, frameno, spi))
+ return true;
+
+ for (i = frameno; i < q->curframe; i++) {
+ callsite = curframe_instance->callchain.callsites[i];
+ instance = q->instances[i];
+ alive = bpf_calls_callback(env, callsite)
+ ? is_live_before(instance, callsite, frameno, spi)
+ : is_live_before(instance, callsite + 1, frameno, spi);
+ if (alive)
+ return true;
+ }
+
+ return false;
+}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 632d51b05fe9..c93a756e035c 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -165,7 +165,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
}
new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
- __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN,
+ __GFP_ZERO | GFP_NOWAIT,
map->numa_node);
if (!new)
return -ENOMEM;
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 38050f4ee400..f50533169cc3 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -498,6 +498,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type)
return "skb";
case BPF_DYNPTR_TYPE_XDP:
return "xdp";
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return "skb_meta";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
@@ -540,19 +542,6 @@ static char slot_type_char[] = {
[STACK_IRQ_FLAG] = 'f'
};
-static void print_liveness(struct bpf_verifier_env *env,
- enum bpf_reg_liveness live)
-{
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
- verbose(env, "_");
- if (live & REG_LIVE_READ)
- verbose(env, "r");
- if (live & REG_LIVE_WRITTEN)
- verbose(env, "w");
- if (live & REG_LIVE_DONE)
- verbose(env, "D");
-}
-
#define UNUM_MAX_DECIMAL U16_MAX
#define SNUM_MAX_DECIMAL S16_MAX
#define SNUM_MIN_DECIMAL S16_MIN
@@ -770,7 +759,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
if (!print_all && !reg_scratched(env, i))
continue;
verbose(env, " R%d", i);
- print_liveness(env, reg->live);
verbose(env, "=");
print_reg_state(env, state, reg);
}
@@ -803,9 +791,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
break;
types_buf[j] = '\0';
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=%s", types_buf);
+ verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
print_reg_state(env, state, reg);
break;
case STACK_DYNPTR:
@@ -814,7 +800,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
reg = &state->stack[i].spilled_ptr;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
if (reg->id)
verbose_a("id=%d", reg->id);
@@ -829,9 +814,8 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
if (!reg->ref_obj_id)
continue;
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ (-i - 1) * BPF_REG_SIZE,
iter_type_str(reg->iter.btf, reg->iter.btf_id),
reg->ref_obj_id, iter_state_str(reg->iter.state),
reg->iter.depth);
@@ -839,9 +823,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
case STACK_MISC:
case STACK_ZERO:
default:
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=%s", types_buf);
+ verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
break;
}
}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 889374722d0a..bd45dda9dc35 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -736,7 +736,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
/* Defer barriers into worker to let the rest of map memory to be freed */
memset(ma, 0, sizeof(*ma));
INIT_WORK(&copy->work, free_mem_alloc_deferred);
- queue_work(system_unbound_wq, &copy->work);
+ queue_work(system_dfl_wq, &copy->work);
}
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 5ab354d55d82..a00561b1d3e5 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -471,7 +471,7 @@ queue:
* any MCS node. This is not the most elegant solution, but is
* simple enough.
*/
- if (unlikely(idx >= _Q_MAX_NODES)) {
+ if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
lockevent_inc(lock_no_node);
RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
while (!queued_spin_trylock(lock)) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3615c06b7dfa..4d53cdd1374c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
@@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {
@@ -646,7 +646,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall */
-int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return bpf_stackmap_extract(map, key, value, true);
+}
+
+/* Called from syscall */
+int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
+ bool delete)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *old_bucket;
@@ -663,7 +671,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
memcpy(value, bucket->data, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
- old_bucket = xchg(&smap->buckets[id], bucket);
+ if (delete)
+ old_bucket = bucket;
+ else
+ old_bucket = xchg(&smap->buckets[id], bucket);
if (old_bucket)
pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
return 0;
@@ -754,6 +765,7 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
.map_lookup_elem = stack_map_lookup_elem,
+ .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem,
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fbfa8532c39..a48fa86f82a7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
+#include <crypto/sha2.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
@@ -38,6 +39,7 @@
#include <linux/tracepoint.h>
#include <linux/overflow.h>
#include <linux/cookie.h>
+#include <linux/verification.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -318,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
+ err = bpf_stackmap_extract(map, key, value, false);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
@@ -672,6 +674,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
/* Nothing to release */
break;
default:
@@ -725,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
/* Nothing to acquire */
break;
default:
@@ -783,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
bpf_wq_cancel_and_free(obj + rec->wq_off);
}
+void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK)))
+ return;
+ bpf_task_work_cancel_and_free(obj + rec->task_work_off);
+}
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -807,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
case BPF_WORKQUEUE:
bpf_wq_cancel_and_free(field_ptr);
break;
+ case BPF_TASK_WORK:
+ bpf_task_work_cancel_and_free(field_ptr);
+ break;
case BPF_KPTR_UNREF:
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
@@ -860,6 +874,7 @@ static void bpf_map_free(struct bpf_map *map)
* the free of values or special fields allocated from bpf memory
* allocator.
*/
+ kfree(map->excl_prog_sha);
migrate_disable();
map->ops->map_free(map);
migrate_enable();
@@ -905,7 +920,7 @@ static void bpf_map_free_in_work(struct bpf_map *map)
/* Avoid spawning kworkers, since they all might contend
* for the same mutex like slab_mutex.
*/
- queue_work(system_unbound_wq, &map->work);
+ queue_work(system_dfl_wq, &map->work);
}
static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
@@ -1237,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
+ BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR |
+ BPF_TASK_WORK,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1269,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
break;
case BPF_TIMER:
case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
@@ -1338,9 +1355,9 @@ static bool bpf_net_capable(void)
return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
}
-#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
+#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
-static int map_create(union bpf_attr *attr, bool kernel)
+static int map_create(union bpf_attr *attr, bpfptr_t uattr)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
@@ -1534,7 +1551,29 @@ static int map_create(union bpf_attr *attr, bool kernel)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_create(map, attr, token, kernel);
+ if (attr->excl_prog_hash) {
+ bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
+
+ if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+ err = -EINVAL;
+ goto free_map;
+ }
+
+ map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!map->excl_prog_sha) {
+ err = -ENOMEM;
+ goto free_map;
+ }
+
+ if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) {
+ err = -EFAULT;
+ goto free_map;
+ }
+ } else if (attr->excl_prog_hash_size) {
+ return -EINVAL;
+ }
+
+ err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
if (err)
goto free_map_sec;
@@ -1627,7 +1666,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
-int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
+ bool delete)
{
return -ENOTSUPP;
}
@@ -2158,7 +2198,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
} else if (map->map_type == BPF_MAP_TYPE_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
@@ -2761,8 +2802,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
}
+static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr,
+ bool is_kernel)
+{
+ bpfptr_t usig = make_bpfptr(attr->signature, is_kernel);
+ struct bpf_dynptr_kern sig_ptr, insns_ptr;
+ struct bpf_key *key = NULL;
+ void *sig;
+ int err = 0;
+
+ if (system_keyring_id_check(attr->keyring_id) == 0)
+ key = bpf_lookup_system_key(attr->keyring_id);
+ else
+ key = bpf_lookup_user_key(attr->keyring_id, 0);
+
+ if (!key)
+ return -EINVAL;
+
+ sig = kvmemdup_bpfptr(usig, attr->signature_size);
+ if (IS_ERR(sig)) {
+ bpf_key_put(key);
+ return -ENOMEM;
+ }
+
+ bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0,
+ attr->signature_size);
+ bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0,
+ prog->len * sizeof(struct bpf_insn));
+
+ err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr,
+ (struct bpf_dynptr *)&sig_ptr, key);
+
+ bpf_key_put(key);
+ kvfree(sig);
+ return err;
+}
+
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
+#define BPF_PROG_LOAD_LAST_FIELD keyring_id
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
@@ -2926,6 +3003,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
/* eBPF programs must be GPL compatible to use GPL-ed functions */
prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
+ if (attr->signature) {
+ err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel);
+ if (err)
+ goto free_prog;
+ }
+
prog->orig_prog = NULL;
prog->jited = 0;
@@ -5161,6 +5244,9 @@ static int bpf_map_get_info_by_fd(struct file *file,
info_len = min_t(u32, sizeof(info), info_len);
memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
info.type = map->map_type;
info.id = map->id;
info.key_size = map->key_size;
@@ -5185,6 +5271,25 @@ static int bpf_map_get_info_by_fd(struct file *file,
return err;
}
+ if (info.hash) {
+ char __user *uhash = u64_to_user_ptr(info.hash);
+
+ if (!map->ops->map_get_hash)
+ return -EINVAL;
+
+ if (info.hash_size != SHA256_DIGEST_SIZE)
+ return -EINVAL;
+
+ err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
+ if (err != 0)
+ return err;
+
+ if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0)
+ return -EFAULT;
+ } else if (info.hash_size) {
+ return -EINVAL;
+ }
+
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
@@ -6008,7 +6113,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr, uattr.is_kernel);
+ err = map_create(&attr, uattr);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index fa353c5d550f..f8e70e9c3998 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -116,31 +116,55 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
-/* Generate partial products by multiplying each bit in the multiplier (tnum a)
- * with the multiplicand (tnum b), and add the partial products after
- * appropriately bit-shifting them. Instead of directly performing tnum addition
- * on the generated partial products, equivalenty, decompose each partial
- * product into two tnums, consisting of the value-sum (acc_v) and the
- * mask-sum (acc_m) and then perform tnum addition on them. The following paper
- * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
+/* Perform long multiplication, iterating through the bits in a using rshift:
+ * - if LSB(a) is a known 0, keep current accumulator
+ * - if LSB(a) is a known 1, add b to current accumulator
+ * - if LSB(a) is unknown, take a union of the above cases.
+ *
+ * For example:
+ *
+ * acc_0: acc_1:
+ *
+ * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1
+ * x1 01 11
+ * ------ ------ ------
+ * 11 11 11
+ * xx 00 11
+ * ------ ------ ------
+ * ???? 0011 1001
*/
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
- u64 acc_v = a.value * b.value;
- struct tnum acc_m = TNUM(0, 0);
+ struct tnum acc = TNUM(0, 0);
while (a.value || a.mask) {
/* LSB of tnum a is a certain 1 */
if (a.value & 1)
- acc_m = tnum_add(acc_m, TNUM(0, b.mask));
+ acc = tnum_add(acc, b);
/* LSB of tnum a is uncertain */
- else if (a.mask & 1)
- acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
+ else if (a.mask & 1) {
+ /* acc = tnum_union(acc_0, acc_1), where acc_0 and
+ * acc_1 are partial accumulators for cases
+ * LSB(a) = certain 0 and LSB(a) = certain 1.
+ * acc_0 = acc + 0 * b = acc.
+ * acc_1 = acc + 1 * b = tnum_add(acc, b).
+ */
+
+ acc = tnum_union(acc, tnum_add(acc, b));
+ }
/* Note: no case for LSB is certain 0 */
a = tnum_rshift(a, 1);
b = tnum_lshift(b, 1);
}
- return tnum_add(TNUM(acc_v, 0), acc_m);
+ return acc;
+}
+
+bool tnum_overlap(struct tnum a, struct tnum b)
+{
+ u64 mu;
+
+ mu = ~a.mask & ~b.mask;
+ return (a.value & mu) == (b.value & mu);
}
/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
@@ -155,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
+/* Returns a tnum with the uncertainty from both a and b, and in addition, new
+ * uncertainty at any position that a and b disagree. This represents a
+ * superset of the union of the concrete sets of both a and b. Despite the
+ * overapproximation, it is optimal.
+ */
+struct tnum tnum_union(struct tnum a, struct tnum b)
+{
+ u64 v = a.value & b.value;
+ u64 mu = (a.value ^ b.value) | a.mask | b.mask;
+
+ return TNUM(v & ~mu, mu);
+}
+
struct tnum tnum_cast(struct tnum a, u8 size)
{
a.value &= (1ULL << (size * 8)) - 1;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 0e364614c3a2..5949095e51c3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -899,8 +899,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
- rcu_read_lock();
- migrate_disable();
+ rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -949,8 +948,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
update_prog_stats(prog, start);
this_cpu_dec(*(prog->active));
- migrate_enable();
- rcu_read_unlock();
+ rcu_read_unlock_migrate();
}
static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
@@ -960,8 +958,7 @@ static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
/* Runtime stats are exported via actual BPF_LSM_CGROUP
* programs, not the shims.
*/
- rcu_read_lock();
- migrate_disable();
+ rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -974,8 +971,7 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
- migrate_enable();
- rcu_read_unlock();
+ rcu_read_unlock_migrate();
}
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
@@ -1033,8 +1029,7 @@ static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
- rcu_read_lock();
- migrate_disable();
+ rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -1048,8 +1043,7 @@ static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- migrate_enable();
- rcu_read_unlock();
+ rcu_read_unlock_migrate();
}
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c4f69a9e9af6..73bba397672a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -674,6 +674,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_SKB;
case DYNPTR_TYPE_XDP:
return BPF_DYNPTR_TYPE_XDP;
+ case DYNPTR_TYPE_SKB_META:
+ return BPF_DYNPTR_TYPE_SKB_META;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
@@ -690,6 +692,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
return DYNPTR_TYPE_SKB;
case BPF_DYNPTR_TYPE_XDP:
return DYNPTR_TYPE_XDP;
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return DYNPTR_TYPE_SKB_META;
default:
return 0;
}
@@ -783,8 +787,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
return 0;
}
@@ -801,29 +804,7 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
- /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
- *
- * While we don't allow reading STACK_INVALID, it is still possible to
- * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
- * helpers or insns can do partial read of that part without failing,
- * but check_stack_range_initialized, check_stack_read_var_off, and
- * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
- * the slot conservatively. Hence we need to prevent those liveness
- * marking walks.
- *
- * This was not a problem before because STACK_INVALID is only set by
- * default (where the default reg state has its reg->parent as NULL), or
- * in clean_live_states after REG_LIVE_DONE (at which point
- * mark_reg_read won't walk reg->parent chain), but not randomly during
- * verifier state exploration (like we did above). Hence, for our case
- * parentage chain will still be live (i.e. reg->parent may be
- * non-NULL), while earlier reg->parent was NULL, so we need
- * REG_LIVE_WRITTEN to screen off read marker propagation when it is
- * done later on reads or by mark_dynptr_read as well to unnecessary
- * mark registers in verifier state.
- */
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
}
static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
@@ -932,9 +913,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
- /* Same reason as unmark_stack_slots_dynptr above */
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
return 0;
}
@@ -1052,7 +1031,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
else
st->type |= PTR_UNTRUSTED;
}
- st->live |= REG_LIVE_WRITTEN;
st->ref_obj_id = i == 0 ? id : 0;
st->iter.btf = btf;
st->iter.btf_id = btf_id;
@@ -1062,6 +1040,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
for (j = 0; j < BPF_REG_SIZE; j++)
slot->slot_type[j] = STACK_ITER;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
mark_stack_slot_scratched(env, spi - i);
}
@@ -1087,12 +1066,10 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
__mark_reg_not_init(env, st);
- /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
- st->live |= REG_LIVE_WRITTEN;
-
for (j = 0; j < BPF_REG_SIZE; j++)
slot->slot_type[j] = STACK_INVALID;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
mark_stack_slot_scratched(env, spi - i);
}
@@ -1182,9 +1159,9 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
slot = &state->stack[spi];
st = &slot->spilled_ptr;
+ bpf_mark_stack_write(env, reg->frameno, BIT(spi));
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
- st->live |= REG_LIVE_WRITTEN;
st->ref_obj_id = id;
st->irq.kfunc_class = kfunc_class;
@@ -1238,8 +1215,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
__mark_reg_not_init(env, st);
- /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
- st->live |= REG_LIVE_WRITTEN;
+ bpf_mark_stack_write(env, reg->frameno, BIT(spi));
for (i = 0; i < BPF_REG_SIZE; i++)
slot->slot_type[i] = STACK_INVALID;
@@ -1754,6 +1730,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return err;
dst_state->speculative = src->speculative;
dst_state->in_sleepable = src->in_sleepable;
+ dst_state->cleaned = src->cleaned;
dst_state->curframe = src->curframe;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
@@ -1946,9 +1923,24 @@ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_stat
return 0;
visit = scc_visit_lookup(env, callchain);
if (!visit) {
- verifier_bug(env, "scc exit: no visit info for call chain %s",
- format_callchain(env, callchain));
- return -EFAULT;
+ /*
+ * If path traversal stops inside an SCC, corresponding bpf_scc_visit
+ * must exist for non-speculative paths. For non-speculative paths
+ * traversal stops when:
+ * a. Verification error is found, maybe_exit_scc() is not called.
+ * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
+ * of any SCC.
+ * c. A checkpoint is reached and matched. Checkpoints are created by
+ * is_state_visited(), which calls maybe_enter_scc(), which allocates
+ * bpf_scc_visit instances for checkpoints within SCCs.
+ * (c) is the only case that can reach this point.
+ */
+ if (!st->speculative) {
+ verifier_bug(env, "scc exit: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ return 0;
}
if (visit->entry_state != st)
return 0;
@@ -2017,7 +2009,7 @@ static void free_backedges(struct bpf_scc_visit *visit)
for (backedge = visit->backedges; backedge; backedge = next) {
free_verifier_state(&backedge->state, false);
next = backedge->next;
- kvfree(backedge);
+ kfree(backedge);
}
visit->backedges = NULL;
}
@@ -2232,10 +2224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
- if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
- reg->map_uid = reg->id;
- if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
+ if (btf_record_has_field(map->inner_map_meta->record,
+ BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
reg->map_uid = reg->id;
+ }
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -2274,7 +2266,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
{
return base_type(reg->type) == PTR_TO_MEM &&
- (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+ (reg->type &
+ (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META));
}
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
@@ -2873,8 +2866,6 @@ static void init_reg_state(struct bpf_verifier_env *env,
for (i = 0; i < MAX_BPF_REG; i++) {
mark_reg_not_init(env, regs, i);
- regs[i].live = REG_LIVE_NONE;
- regs[i].parent = NULL;
regs[i].subreg_def = DEF_NOT_SUBREG;
}
@@ -2958,7 +2949,7 @@ static int cmp_subprogs(const void *a, const void *b)
}
/* Find subprogram that contains instruction at 'off' */
-static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off)
+struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *vals = env->subprog_info;
int l, r, m;
@@ -2983,7 +2974,7 @@ static int find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
- p = find_containing_subprog(env, off);
+ p = bpf_find_containing_subprog(env, off);
if (!p || p->start != off)
return -ENOENT;
return p - env->subprog_info;
@@ -3494,15 +3485,6 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return 0;
}
-static int jmp_offset(struct bpf_insn *insn)
-{
- u8 code = insn->code;
-
- if (code == (BPF_JMP32 | BPF_JA))
- return insn->imm;
- return insn->off;
-}
-
static int check_subprogs(struct bpf_verifier_env *env)
{
int i, subprog_start, subprog_end, off, cur_subprog = 0;
@@ -3529,7 +3511,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
- off = i + jmp_offset(&insn[i]) + 1;
+ off = i + bpf_jmp_offset(&insn[i]) + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
@@ -3555,69 +3537,15 @@ next:
return 0;
}
-/* Parentage chain of this register (or stack slot) should take care of all
- * issues like callee-saved registers, stack slot allocation time, etc.
- */
-static int mark_reg_read(struct bpf_verifier_env *env,
- const struct bpf_reg_state *state,
- struct bpf_reg_state *parent, u8 flag)
-{
- bool writes = parent == state->parent; /* Observe write marks */
- int cnt = 0;
-
- while (parent) {
- /* if read wasn't screened by an earlier write ... */
- if (writes && state->live & REG_LIVE_WRITTEN)
- break;
- if (verifier_bug_if(parent->live & REG_LIVE_DONE, env,
- "type %s var_off %lld off %d",
- reg_type_str(env, parent->type),
- parent->var_off.value, parent->off))
- return -EFAULT;
- /* The first condition is more likely to be true than the
- * second, checked it first.
- */
- if ((parent->live & REG_LIVE_READ) == flag ||
- parent->live & REG_LIVE_READ64)
- /* The parentage chain never changes and
- * this parent was already marked as LIVE_READ.
- * There is no need to keep walking the chain again and
- * keep re-marking all parents as LIVE_READ.
- * This case happens when the same register is read
- * multiple times without writes into it in-between.
- * Also, if parent has the stronger REG_LIVE_READ64 set,
- * then no need to set the weak REG_LIVE_READ32.
- */
- break;
- /* ... then we depend on parent's value */
- parent->live |= flag;
- /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
- if (flag == REG_LIVE_READ64)
- parent->live &= ~REG_LIVE_READ32;
- state = parent;
- parent = state->parent;
- writes = true;
- cnt++;
- }
-
- if (env->longest_mark_read_walk < cnt)
- env->longest_mark_read_walk = cnt;
- return 0;
-}
-
static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int spi, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
int err, i;
for (i = 0; i < nr_slots; i++) {
- struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
-
- err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i));
if (err)
return err;
-
mark_stack_slot_scratched(env, spi - i);
}
return 0;
@@ -3663,7 +3591,7 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
*/
-static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
+static bool is_reg64(struct bpf_insn *insn,
u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
{
u8 code, class, op;
@@ -3774,14 +3702,14 @@ static int insn_def_regno(const struct bpf_insn *insn)
}
/* Return TRUE if INSN has defined any 32-bit value explicitly. */
-static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static bool insn_has_def32(struct bpf_insn *insn)
{
int dst_reg = insn_def_regno(insn);
if (dst_reg == -1)
return false;
- return !is_reg64(env, insn, dst_reg, NULL, DST_OP);
+ return !is_reg64(insn, dst_reg, NULL, DST_OP);
}
static void mark_insn_zext(struct bpf_verifier_env *env,
@@ -3812,7 +3740,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r
mark_reg_scratched(env, regno);
reg = &regs[regno];
- rw64 = is_reg64(env, insn, regno, reg, t);
+ rw64 = is_reg64(insn, regno, reg, t);
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (reg->type == NOT_INIT) {
@@ -3826,15 +3754,13 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r
if (rw64)
mark_insn_zext(env, reg);
- return mark_reg_read(env, reg, reg->parent,
- rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
+ return 0;
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose(env, "frame pointer is read only\n");
return -EACCES;
}
- reg->live |= REG_LIVE_WRITTEN;
reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
if (t == DST_OP)
mark_reg_unknown(env, regs, regno);
@@ -4195,7 +4121,7 @@ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
}
}
/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
-static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
{
DECLARE_BITMAP(mask, 64);
bool first = true;
@@ -4250,8 +4176,6 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_histo
}
}
-static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
-
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
@@ -4278,7 +4202,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
verbose(env, "mark_precise: frame%d: regs=%s ",
bt->frame, env->tmp_str_buf);
- fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
verbose(env, "stack=%s before ", env->tmp_str_buf);
verbose(env, "%d: ", idx);
verbose_insn(env, insn);
@@ -4479,7 +4403,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* backtracking, as these registers are set by the function
* invoking callback.
*/
- if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
+ if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx))
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
@@ -4918,7 +4842,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env,
bt_frame_reg_mask(bt, fr));
verbose(env, "mark_precise: frame%d: parent state regs=%s ",
fr, env->tmp_str_buf);
- fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
bt_frame_stack_mask(bt, fr));
verbose(env, "stack=%s: ", env->tmp_str_buf);
print_verifier_state(env, st, fr, true);
@@ -5041,12 +4965,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
/* Copy src state preserving dst->parent and dst->live fields */
static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
{
- struct bpf_reg_state *parent = dst->parent;
- enum bpf_reg_liveness live = dst->live;
-
*dst = *src;
- dst->parent = parent;
- dst->live = live;
}
static void save_register_state(struct bpf_verifier_env *env,
@@ -5057,8 +4976,6 @@ static void save_register_state(struct bpf_verifier_env *env,
int i;
copy_register_state(&state->stack[spi].spilled_ptr, reg);
- if (size == BPF_REG_SIZE)
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
state->stack[spi].slot_type[i - 1] = STACK_SPILL;
@@ -5152,6 +5069,18 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
+ if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) {
+ /* only mark the slot as written if all 8 bytes were written
+ * otherwise read propagation may incorrectly stop too soon
+ * when stack slots are partially written.
+ * This heuristic means that read propagation will be
+ * conservative, since it will add reg_live_read marks
+ * to stack slots all the way to first state when programs
+ * writes+reads less than 8 bytes
+ */
+ bpf_mark_stack_write(env, state->frameno, BIT(spi));
+ }
+
check_fastcall_stack_contract(env, state, insn_idx, off);
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
@@ -5195,17 +5124,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
for (i = 0; i < BPF_REG_SIZE; i++)
scrub_spilled_slot(&state->stack[spi].slot_type[i]);
- /* only mark the slot as written if all 8 bytes were written
- * otherwise read propagation may incorrectly stop too soon
- * when stack slots are partially written.
- * This heuristic means that read propagation will be
- * conservative, since it will add reg_live_read marks
- * to stack slots all the way to first state when programs
- * writes+reads less than 8 bytes
- */
- if (size == BPF_REG_SIZE)
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
-
/* when we zero initialize stack slots mark them as such */
if ((reg && register_is_null(reg)) ||
(!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
@@ -5398,7 +5316,6 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
/* have read misc data from the stack */
mark_reg_unknown(env, state->regs, dst_regno);
}
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
}
/* Read the stack at 'off' and put the results into the register indicated by
@@ -5421,12 +5338,16 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
u8 *stype, type;
int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
+ int err;
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
mark_stack_slot_scratched(env, spi);
check_fastcall_stack_contract(env, state, env->insn_idx, off);
+ err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi));
+ if (err)
+ return err;
if (is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
@@ -5441,7 +5362,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (dst_regno < 0)
return 0;
@@ -5495,7 +5415,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* not restoring original register state */
}
}
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} else if (dst_regno >= 0) {
/* restore register state from stack */
copy_register_state(&state->regs[dst_regno], reg);
@@ -5503,7 +5422,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
*/
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
/* If dst_regno==-1, the caller is asking us whether
* it is acceptable to use this value as a SCALAR_VALUE
@@ -5515,7 +5433,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
off);
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE];
@@ -5529,7 +5446,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
off, i, size);
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (dst_regno >= 0)
mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
insn_flags = 0; /* we are not restoring spilled register */
@@ -8157,10 +8073,10 @@ mark:
/* reading any byte out of 8-byte 'spill_slot' will cause
* the whole slot to be marked as 'read'
*/
- mark_reg_read(env, &state->stack[spi].spilled_ptr,
- state->stack[spi].spilled_ptr.parent,
- REG_LIVE_READ64);
- /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not
+ err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi));
+ if (err)
+ return err;
+ /* We do not call bpf_mark_stack_write(), as we can not
* be sure that whether stack slot is written to or not. Hence,
* we must still conservatively propagate reads upwards even if
* helper may write to the entire memory range.
@@ -8515,38 +8431,70 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
return 0;
}
-static int process_timer_func(struct bpf_verifier_env *env, int regno,
- struct bpf_call_arg_meta *meta)
+/* Check if @regno is a pointer to a specific field in a map value */
+static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
+ enum btf_field_type field_type)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
bool is_const = tnum_is_const(reg->var_off);
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
+ const char *struct_name = btf_field_type_name(field_type);
+ int field_off = -1;
if (!is_const) {
verbose(env,
- "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
- regno);
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, struct_name);
return -EINVAL;
}
if (!map->btf) {
- verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
- map->name);
+ verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name,
+ struct_name);
+ return -EINVAL;
+ }
+ if (!btf_record_has_field(map->record, field_type)) {
+ verbose(env, "map '%s' has no valid %s\n", map->name, struct_name);
return -EINVAL;
}
- if (!btf_record_has_field(map->record, BPF_TIMER)) {
- verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
+ switch (field_type) {
+ case BPF_TIMER:
+ field_off = map->record->timer_off;
+ break;
+ case BPF_TASK_WORK:
+ field_off = map->record->task_work_off;
+ break;
+ default:
+ verifier_bug(env, "unsupported BTF field type: %s\n", struct_name);
return -EINVAL;
}
- if (map->record->timer_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
- val + reg->off, map->record->timer_off);
+ if (field_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n",
+ val + reg->off, struct_name, field_off);
return -EINVAL;
}
+ return 0;
+}
+
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_TIMER);
+ if (err)
+ return err;
+
if (meta->map_ptr) {
verifier_bug(env, "Two map pointers in a timer helper");
return -EFAULT;
}
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
+ return -EOPNOTSUPP;
+ }
meta->map_uid = reg->map_uid;
meta->map_ptr = map;
return 0;
@@ -8569,6 +8517,26 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_task_work_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_TASK_WORK);
+ if (err)
+ return err;
+
+ if (meta->map.ptr) {
+ verifier_bug(env, "Two map pointers in a bpf_task_work helper");
+ return -EFAULT;
+ }
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
@@ -10398,6 +10366,8 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
+static bool is_task_work_add_kfunc(u32 func_id);
+
static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
@@ -10616,7 +10586,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
insn_idx, subprog,
- is_bpf_wq_set_callback_impl_kfunc(insn->imm));
+ is_bpf_wq_set_callback_impl_kfunc(insn->imm) ||
+ is_task_work_add_kfunc(insn->imm));
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
@@ -10717,6 +10688,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
+ bpf_reset_live_stack_callchain(env);
+
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
print_verifier_state(env, state, caller->frameno, true);
@@ -10842,7 +10815,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
- callee->callback_ret_range = retval_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 0);
return 0;
}
@@ -10929,6 +10902,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr;
+
+ /*
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ callee->callback_ret_range = retval_range(S32_MIN, S32_MAX);
+ return 0;
+}
+
static bool is_rbtree_lock_required_kfunc(u32 btf_id);
/* Are we currently verifying the callback for a rbtree helper that must
@@ -10992,8 +10995,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
}
/* we are going to rely on register's precise value */
- err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
- err = err ?: mark_chain_precision(env, BPF_REG_0);
+ err = mark_chain_precision(env, BPF_REG_0);
if (err)
return err;
@@ -11003,7 +11005,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
"At callback return", "R0");
return -EINVAL;
}
- if (!calls_callback(env, callee->callsite)) {
+ if (!bpf_calls_callback(env, callee->callsite)) {
verifier_bug(env, "in callback at %d, callsite %d !calls_callback",
*insn_idx, callee->callsite);
return -EFAULT;
@@ -11354,7 +11356,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
return -EINVAL;
*ptr = env->ops->get_func_proto(func_id, env->prog);
- return *ptr ? 0 : -EINVAL;
+ return *ptr && (*ptr)->func ? 0 : -EINVAL;
}
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -11641,7 +11643,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
return -EFAULT;
- if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB ||
+ dynptr_type == BPF_DYNPTR_TYPE_SKB_META)
/* this will trigger clear_all_pkt_pointers(), which will
* invalidate all dynptr slices associated with the skb
*/
@@ -11896,17 +11899,11 @@ static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_re
if (regno == BPF_REG_0) {
/* Function return value */
- reg->live |= REG_LIVE_WRITTEN;
reg->subreg_def = reg_size == sizeof(u64) ?
DEF_NOT_SUBREG : env->insn_idx + 1;
- } else {
+ } else if (reg_size == sizeof(u64)) {
/* Function argument */
- if (reg_size == sizeof(u64)) {
- mark_insn_zext(env, reg);
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
- } else {
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
- }
+ mark_insn_zext(env, reg);
}
}
@@ -12059,6 +12056,7 @@ enum {
KF_ARG_RB_NODE_ID,
KF_ARG_WORKQUEUE_ID,
KF_ARG_RES_SPIN_LOCK_ID,
+ KF_ARG_TASK_WORK_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
@@ -12069,6 +12067,7 @@ BTF_ID(struct, bpf_rb_root)
BTF_ID(struct, bpf_rb_node)
BTF_ID(struct, bpf_wq)
BTF_ID(struct, bpf_res_spin_lock)
+BTF_ID(struct, bpf_task_work)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -12117,6 +12116,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
}
+static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID);
+}
+
static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg)
{
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
@@ -12204,6 +12208,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_WORKQUEUE,
KF_ARG_PTR_TO_IRQ_FLAG,
KF_ARG_PTR_TO_RES_SPIN_LOCK,
+ KF_ARG_PTR_TO_TASK_WORK,
};
enum special_kfunc_type {
@@ -12228,6 +12233,8 @@ enum special_kfunc_type {
KF_bpf_rbtree_right,
KF_bpf_dynptr_from_skb,
KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_from_skb_meta,
+ KF_bpf_xdp_pull_data,
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
@@ -12252,6 +12259,8 @@ enum special_kfunc_type {
KF_bpf_res_spin_lock_irqsave,
KF_bpf_res_spin_unlock_irqrestore,
KF___bpf_trap,
+ KF_bpf_task_work_schedule_signal,
+ KF_bpf_task_work_schedule_resume,
};
BTF_ID_LIST(special_kfunc_list)
@@ -12277,9 +12286,13 @@ BTF_ID(func, bpf_rbtree_right)
#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_from_skb_meta)
+BTF_ID(func, bpf_xdp_pull_data)
#else
BTF_ID_UNUSED
BTF_ID_UNUSED
+BTF_ID_UNUSED
+BTF_ID_UNUSED
#endif
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
@@ -12318,6 +12331,14 @@ BTF_ID(func, bpf_res_spin_unlock)
BTF_ID(func, bpf_res_spin_lock_irqsave)
BTF_ID(func, bpf_res_spin_unlock_irqrestore)
BTF_ID(func, __bpf_trap)
+BTF_ID(func, bpf_task_work_schedule_signal)
+BTF_ID(func, bpf_task_work_schedule_resume)
+
+static bool is_task_work_add_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
+}
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -12349,6 +12370,11 @@ static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
}
+static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data];
+}
+
static enum kfunc_ptr_arg_type
get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
struct bpf_kfunc_call_arg_meta *meta,
@@ -12408,6 +12434,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_wq(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_WORKQUEUE;
+ if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_TASK_WORK;
+
if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_IRQ_FLAG;
@@ -12751,7 +12780,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
static bool is_async_callback_calling_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] ||
+ is_task_work_add_kfunc(btf_id);
}
static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
@@ -13132,7 +13162,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "pointer in R%d isn't map pointer\n", regno);
return -EINVAL;
}
- if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) {
+ if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 ||
+ reg->map_ptr->record->task_work_off >= 0)) {
/* Use map_uid (which is unique id of inner map) to reject:
* inner_map1 = bpf_map_lookup_elem(outer_map, key1)
* inner_map2 = bpf_map_lookup_elem(outer_map, key2)
@@ -13147,6 +13178,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
*/
if (meta->map.ptr != reg->map_ptr ||
meta->map.uid != reg->map_uid) {
+ if (reg->map_ptr->record->task_work_off >= 0) {
+ verbose(env,
+ "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
verbose(env,
"workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
meta->map.uid, reg->map_uid);
@@ -13185,6 +13222,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_TASK_WORK:
case KF_ARG_PTR_TO_IRQ_FLAG:
case KF_ARG_PTR_TO_RES_SPIN_LOCK:
break;
@@ -13253,6 +13291,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
dynptr_arg_type |= DYNPTR_TYPE_SKB;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
dynptr_arg_type |= DYNPTR_TYPE_XDP;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) {
+ dynptr_arg_type |= DYNPTR_TYPE_SKB_META;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
(dynptr_arg_type & MEM_UNINIT)) {
enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
@@ -13476,6 +13516,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret < 0)
return ret;
break;
+ case KF_ARG_PTR_TO_TASK_WORK:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_task_work_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
case KF_ARG_PTR_TO_IRQ_FLAG:
if (reg->type != PTR_TO_STACK) {
verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
@@ -13842,6 +13891,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (is_task_work_add_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_task_work_schedule_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
@@ -13901,6 +13960,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
+ if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) {
+ verbose(env, "kernel func %s requires RCU critical section protection\n", func_name);
+ return -EACCES;
+ }
+
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -14014,6 +14078,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Ensures we don't access the memory after a release_reference() */
if (meta.ref_obj_id)
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+
+ if (is_kfunc_rcu_protected(&meta))
+ regs[BPF_REG_0].type |= MEM_RCU;
} else {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].btf = desc_btf;
@@ -14022,6 +14089,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
regs[BPF_REG_0].type |= PTR_UNTRUSTED;
+ else if (is_kfunc_rcu_protected(&meta))
+ regs[BPF_REG_0].type |= MEM_RCU;
if (is_iter_next_kfunc(&meta)) {
struct bpf_reg_state *cur_iter;
@@ -14066,6 +14135,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (is_kfunc_pkt_changing(&meta))
+ clear_all_pkt_pointers(env);
+
nargs = btf_type_vlen(meta.func_proto);
args = (const struct btf_param *)(meta.func_proto + 1);
for (i = 0; i < nargs; i++) {
@@ -15645,7 +15717,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
- dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
/* case: R1 = (s8, s16 s32)R2 */
@@ -15664,7 +15735,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (!no_sext)
dst_reg->id = 0;
coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
- dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
mark_reg_unknown(env, regs, insn->dst_reg);
@@ -15690,7 +15760,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
if (!is_src_reg_u32)
dst_reg->id = 0;
- dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
} else {
/* case: W1 = (s8, s16)W2 */
@@ -15701,7 +15770,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
copy_register_state(dst_reg, src_reg);
if (!no_sext)
dst_reg->id = 0;
- dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
}
@@ -15886,6 +15954,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
*/
if (tnum_is_const(t1) && tnum_is_const(t2))
return t1.value == t2.value;
+ if (!tnum_overlap(t1, t2))
+ return 0;
/* non-overlapping ranges */
if (umin1 > umax2 || umax1 < umin2)
return 0;
@@ -15910,6 +15980,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
*/
if (tnum_is_const(t1) && tnum_is_const(t2))
return t1.value != t2.value;
+ if (!tnum_overlap(t1, t2))
+ return 1;
/* non-overlapping ranges */
if (umin1 > umax2 || umax1 < umin2)
return 1;
@@ -17117,9 +17189,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
}
if (frame->in_async_callback_fn) {
- /* enforce return zero from async callbacks like timer */
exit_ctx = "At async callback return";
- range = retval_range(0, 0);
+ range = frame->callback_ret_range;
goto enforce_retval;
}
@@ -17258,7 +17329,7 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *subprog;
- subprog = find_containing_subprog(env, off);
+ subprog = bpf_find_containing_subprog(env, off);
subprog->changes_pkt_data = true;
}
@@ -17266,7 +17337,7 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *subprog;
- subprog = find_containing_subprog(env, off);
+ subprog = bpf_find_containing_subprog(env, off);
subprog->might_sleep = true;
}
@@ -17280,8 +17351,8 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
{
struct bpf_subprog_info *caller, *callee;
- caller = find_containing_subprog(env, t);
- callee = find_containing_subprog(env, w);
+ caller = bpf_find_containing_subprog(env, t);
+ callee = bpf_find_containing_subprog(env, w);
caller->changes_pkt_data |= callee->changes_pkt_data;
caller->might_sleep |= callee->might_sleep;
}
@@ -17351,7 +17422,7 @@ static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
env->insn_aux_data[idx].calls_callback = true;
}
-static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
+bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx)
{
return env->insn_aux_data[insn_idx].calls_callback;
}
@@ -17783,6 +17854,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
*/
if (ret == 0 && is_kfunc_sleepable(&meta))
mark_subprog_might_sleep(env, t);
+ if (ret == 0 && is_kfunc_pkt_changing(&meta))
+ mark_subprog_changes_pkt_data(env, t);
}
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
@@ -17825,7 +17898,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
static int check_cfg(struct bpf_verifier_env *env)
{
int insn_cnt = env->prog->len;
- int *insn_stack, *insn_state, *insn_postorder;
+ int *insn_stack, *insn_state;
int ex_insn_beg, i, ret = 0;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
@@ -17838,14 +17911,6 @@ static int check_cfg(struct bpf_verifier_env *env)
return -ENOMEM;
}
- insn_postorder = env->cfg.insn_postorder =
- kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
- if (!insn_postorder) {
- kvfree(insn_state);
- kvfree(insn_stack);
- return -ENOMEM;
- }
-
ex_insn_beg = env->exception_callback_subprog
? env->subprog_info[env->exception_callback_subprog].start
: 0;
@@ -17863,7 +17928,6 @@ walk_cfg:
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
env->cfg.cur_stack--;
- insn_postorder[env->cfg.cur_postorder++] = t;
break;
case KEEP_EXPLORING:
break;
@@ -17917,6 +17981,56 @@ err_free:
return ret;
}
+/*
+ * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range
+ * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start)
+ * with indices of 'i' instructions in postorder.
+ */
+static int compute_postorder(struct bpf_verifier_env *env)
+{
+ u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2];
+ int *stack = NULL, *postorder = NULL, *state = NULL;
+
+ postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ if (!postorder || !state || !stack) {
+ kvfree(postorder);
+ kvfree(state);
+ kvfree(stack);
+ return -ENOMEM;
+ }
+ cur_postorder = 0;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ env->subprog_info[i].postorder_start = cur_postorder;
+ stack[0] = env->subprog_info[i].start;
+ stack_sz = 1;
+ do {
+ top = stack[stack_sz - 1];
+ state[top] |= DISCOVERED;
+ if (state[top] & EXPLORED) {
+ postorder[cur_postorder++] = top;
+ stack_sz--;
+ continue;
+ }
+ succ_cnt = bpf_insn_successors(env->prog, top, succ);
+ for (s = 0; s < succ_cnt; ++s) {
+ if (!state[succ[s]]) {
+ stack[stack_sz++] = succ[s];
+ state[succ[s]] |= DISCOVERED;
+ }
+ }
+ state[top] |= EXPLORED;
+ } while (stack_sz);
+ }
+ env->subprog_info[i].postorder_start = cur_postorder;
+ env->cfg.insn_postorder = postorder;
+ env->cfg.cur_postorder = cur_postorder;
+ kvfree(stack);
+ kvfree(state);
+ return 0;
+}
+
static int check_abnormal_return(struct bpf_verifier_env *env)
{
int i;
@@ -18449,16 +18563,15 @@ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
}
static void clean_func_state(struct bpf_verifier_env *env,
- struct bpf_func_state *st)
+ struct bpf_func_state *st,
+ u32 ip)
{
- enum bpf_reg_liveness live;
+ u16 live_regs = env->insn_aux_data[ip].live_regs_before;
int i, j;
for (i = 0; i < BPF_REG_FP; i++) {
- live = st->regs[i].live;
/* liveness must not touch this register anymore */
- st->regs[i].live |= REG_LIVE_DONE;
- if (!(live & REG_LIVE_READ))
+ if (!(live_regs & BIT(i)))
/* since the register is unused, clear its state
* to make further comparison simpler
*/
@@ -18466,10 +18579,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
}
for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
- live = st->stack[i].spilled_ptr.live;
- /* liveness must not touch this stack slot anymore */
- st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
- if (!(live & REG_LIVE_READ)) {
+ if (!bpf_stack_slot_alive(env, st->frameno, i)) {
__mark_reg_not_init(env, &st->stack[i].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
st->stack[i].slot_type[j] = STACK_INVALID;
@@ -18480,10 +18590,14 @@ static void clean_func_state(struct bpf_verifier_env *env,
static void clean_verifier_state(struct bpf_verifier_env *env,
struct bpf_verifier_state *st)
{
- int i;
+ int i, ip;
- for (i = 0; i <= st->curframe; i++)
- clean_func_state(env, st->frame[i]);
+ bpf_live_stack_query_init(env, st);
+ st->cleaned = true;
+ for (i = 0; i <= st->curframe; i++) {
+ ip = frame_insn_idx(st, i);
+ clean_func_state(env, st->frame[i], ip);
+ }
}
/* the parentage chains form a tree.
@@ -18494,25 +18608,23 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
* but a lot of states will get revised from liveness point of view when
* the verifier explores other branches.
* Example:
- * 1: r0 = 1
+ * 1: *(u64)(r10 - 8) = 1
* 2: if r1 == 100 goto pc+1
- * 3: r0 = 2
- * 4: exit
- * when the verifier reaches exit insn the register r0 in the state list of
- * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
- * of insn 2 and goes exploring further. At the insn 4 it will walk the
- * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
+ * 3: *(u64)(r10 - 8) = 2
+ * 4: r0 = *(u64)(r10 - 8)
+ * 5: exit
+ * when the verifier reaches exit insn the stack slot -8 in the state list of
+ * insn 2 is not yet marked alive. Then the verifier pops the other_branch
+ * of insn 2 and goes exploring further. After the insn 4 read, liveness
+ * analysis would propagate read mark for -8 at insn 2.
*
* Since the verifier pushes the branch states as it sees them while exploring
* the program the condition of walking the branch instruction for the second
* time means that all states below this branch were already explored and
* their final liveness marks are already propagated.
* Hence when the verifier completes the search of state list in is_state_visited()
- * we can call this clean_live_states() function to mark all liveness states
- * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
- * will not be used.
- * This function also clears the registers and stack for states that !READ
- * to simplify state merging.
+ * we can call this clean_live_states() function to clear dead the registers and stack
+ * slots to simplify state merging.
*
* Important note here that walking the same branch instruction in the callee
* doesn't meant that the states are DONE. The verifier has to compare
@@ -18532,7 +18644,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
if (sl->state.insn_idx != insn ||
!same_callsites(&sl->state, cur))
continue;
- if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE)
+ if (sl->state.cleaned)
/* all regs in this state in all frames were already marked */
continue;
if (incomplete_read_marks(env, &sl->state))
@@ -18564,9 +18676,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
if (exact == EXACT)
return regs_exact(rold, rcur, idmap);
- if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT)
- /* explored state didn't use this */
- return true;
if (rold->type == NOT_INIT) {
if (exact == NOT_EXACT || rcur->type == NOT_INIT)
/* explored state can't have used this */
@@ -18690,7 +18799,6 @@ static struct bpf_reg_state unbound_reg;
static __init int unbound_reg_init(void)
{
__mark_reg_unknown_imprecise(&unbound_reg);
- unbound_reg.live |= REG_LIVE_READ;
return 0;
}
late_initcall(unbound_reg_init);
@@ -18743,13 +18851,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
return false;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
- && exact == NOT_EXACT) {
- i += BPF_REG_SIZE - 1;
- /* explored state didn't use this */
- continue;
- }
-
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
@@ -18992,91 +19093,6 @@ static bool states_equal(struct bpf_verifier_env *env,
return true;
}
-/* Return 0 if no propagation happened. Return negative error code if error
- * happened. Otherwise, return the propagated bit.
- */
-static int propagate_liveness_reg(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- struct bpf_reg_state *parent_reg)
-{
- u8 parent_flag = parent_reg->live & REG_LIVE_READ;
- u8 flag = reg->live & REG_LIVE_READ;
- int err;
-
- /* When comes here, read flags of PARENT_REG or REG could be any of
- * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
- * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
- */
- if (parent_flag == REG_LIVE_READ64 ||
- /* Or if there is no read flag from REG. */
- !flag ||
- /* Or if the read flag from REG is the same as PARENT_REG. */
- parent_flag == flag)
- return 0;
-
- err = mark_reg_read(env, reg, parent_reg, flag);
- if (err)
- return err;
-
- return flag;
-}
-
-/* A write screens off any subsequent reads; but write marks come from the
- * straight-line code between a state and its parent. When we arrive at an
- * equivalent state (jump target or such) we didn't arrive by the straight-line
- * code, so read marks in the state must propagate to the parent regardless
- * of the state's write marks. That's what 'parent == state->parent' comparison
- * in mark_reg_read() is for.
- */
-static int propagate_liveness(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *vstate,
- struct bpf_verifier_state *vparent,
- bool *changed)
-{
- struct bpf_reg_state *state_reg, *parent_reg;
- struct bpf_func_state *state, *parent;
- int i, frame, err = 0;
- bool tmp = false;
-
- changed = changed ?: &tmp;
- if (vparent->curframe != vstate->curframe) {
- WARN(1, "propagate_live: parent frame %d current frame %d\n",
- vparent->curframe, vstate->curframe);
- return -EFAULT;
- }
- /* Propagate read liveness of registers... */
- BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
- for (frame = 0; frame <= vstate->curframe; frame++) {
- parent = vparent->frame[frame];
- state = vstate->frame[frame];
- parent_reg = parent->regs;
- state_reg = state->regs;
- /* We don't need to worry about FP liveness, it's read-only */
- for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
- err = propagate_liveness_reg(env, &state_reg[i],
- &parent_reg[i]);
- if (err < 0)
- return err;
- *changed |= err > 0;
- if (err == REG_LIVE_READ64)
- mark_insn_zext(env, &parent_reg[i]);
- }
-
- /* Propagate stack slots. */
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
- i < parent->allocated_stack / BPF_REG_SIZE; i++) {
- parent_reg = &parent->stack[i].spilled_ptr;
- state_reg = &state->stack[i].spilled_ptr;
- err = propagate_liveness_reg(env, state_reg,
- parent_reg);
- *changed |= err > 0;
- if (err < 0)
- return err;
- }
- }
- return 0;
-}
-
/* find precise scalars in the previous equivalent state and
* propagate them into the current state
*/
@@ -19096,8 +19112,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
first = true;
for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise ||
- !(state_reg->live & REG_LIVE_READ))
+ !state_reg->precise)
continue;
if (env->log.level & BPF_LOG_LEVEL2) {
if (first)
@@ -19114,8 +19129,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
continue;
state_reg = &state->stack[i].spilled_ptr;
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise ||
- !(state_reg->live & REG_LIVE_READ))
+ !state_reg->precise)
continue;
if (env->log.level & BPF_LOG_LEVEL2) {
if (first)
@@ -19165,9 +19179,6 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi
changed = false;
for (backedge = visit->backedges; backedge; backedge = backedge->next) {
st = &backedge->state;
- err = propagate_liveness(env, st->equal_state, st, &changed);
- if (err)
- return err;
err = propagate_precision(env, st->equal_state, st, &changed);
if (err)
return err;
@@ -19191,7 +19202,7 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
fcur = cur->frame[fr];
for (i = 0; i < MAX_BPF_REG; i++)
if (memcmp(&fold->regs[i], &fcur->regs[i],
- offsetof(struct bpf_reg_state, parent)))
+ offsetof(struct bpf_reg_state, frameno)))
return false;
return true;
}
@@ -19289,7 +19300,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *cur = env->cur_state, *new;
bool force_new_state, add_new_state, loop;
- int i, j, n, err, states_cnt = 0;
+ int n, err, states_cnt = 0;
struct list_head *pos, *tmp, *head;
force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
@@ -19404,7 +19415,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
goto hit;
}
}
- if (calls_callback(env, insn_idx)) {
+ if (bpf_calls_callback(env, insn_idx)) {
if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
goto hit;
goto skip_inf_loop_check;
@@ -19447,25 +19458,15 @@ skip_inf_loop_check:
if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
hit:
sl->hit_cnt++;
- /* reached equivalent register/stack state,
- * prune the search.
- * Registers read by the continuation are read by us.
- * If we have any write marks in env->cur_state, they
- * will prevent corresponding reads in the continuation
- * from reaching our parent (an explored_state). Our
- * own state will get the read marks recorded, but
- * they'll be immediately forgotten as we're pruning
- * this state and will pop a new one.
- */
- err = propagate_liveness(env, &sl->state, cur, NULL);
/* if previous state reached the exit with precision and
* current state is equivalent to it (except precision marks)
* the precision needs to be propagated back in
* the current state.
*/
+ err = 0;
if (is_jmp_point(env, env->insn_idx))
- err = err ? : push_jmp_history(env, cur, 0, 0);
+ err = push_jmp_history(env, cur, 0, 0);
err = err ? : propagate_precision(env, &sl->state, cur, NULL);
if (err)
return err;
@@ -19553,7 +19554,7 @@ hit:
err = err ?: add_scc_backedge(env, &sl->state, backedge);
if (err) {
free_verifier_state(&backedge->state, false);
- kvfree(backedge);
+ kfree(backedge);
return err;
}
}
@@ -19636,7 +19637,7 @@ miss:
err = maybe_enter_scc(env, new);
if (err) {
free_verifier_state(new, false);
- kvfree(new_sl);
+ kfree(new_sl);
return err;
}
@@ -19645,38 +19646,6 @@ miss:
cur->dfs_depth = new->dfs_depth + 1;
clear_jmp_history(cur);
list_add(&new_sl->node, head);
-
- /* connect new state to parentage chain. Current frame needs all
- * registers connected. Only r6 - r9 of the callers are alive (pushed
- * to the stack implicitly by JITs) so in callers' frames connect just
- * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
- * the state of the call instruction (with WRITTEN set), and r0 comes
- * from callee with its full parentage chain, anyway.
- */
- /* clear write marks in current state: the writes we did are not writes
- * our child did, so they don't screen off its reads from us.
- * (There are no read marks in current state, because reads always mark
- * their parent and current state never has children yet. Only
- * explored_states can get read marks.)
- */
- for (j = 0; j <= cur->curframe; j++) {
- for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
- cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
- for (i = 0; i < BPF_REG_FP; i++)
- cur->frame[j]->regs[i].live = REG_LIVE_NONE;
- }
-
- /* all stack frames are accessible from callee, clear them all */
- for (j = 0; j <= cur->curframe; j++) {
- struct bpf_func_state *frame = cur->frame[j];
- struct bpf_func_state *newframe = new->frame[j];
-
- for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
- frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
- frame->stack[i].spilled_ptr.parent =
- &newframe->stack[i].spilled_ptr;
- }
- }
return 0;
}
@@ -19812,6 +19781,9 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
return PROCESS_BPF_EXIT;
if (env->cur_state->curframe) {
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
if (err)
@@ -19997,7 +19969,7 @@ static int do_check(struct bpf_verifier_env *env)
for (;;) {
struct bpf_insn *insn;
struct bpf_insn_aux_data *insn_aux;
- int err;
+ int err, marks_err;
/* reset current history entry on each new instruction */
env->cur_hist_ent = NULL;
@@ -20090,7 +20062,15 @@ static int do_check(struct bpf_verifier_env *env)
if (state->speculative && insn_aux->nospec)
goto process_bpf_exit;
+ err = bpf_reset_stack_write_marks(env, env->insn_idx);
+ if (err)
+ return err;
err = do_check_insn(env, &do_print_state);
+ if (err >= 0 || error_recoverable_with_nospec(err)) {
+ marks_err = bpf_commit_stack_write_marks(env);
+ if (marks_err)
+ return marks_err;
+ }
if (error_recoverable_with_nospec(err) && state->speculative) {
/* Prevent this speculative path from ever reaching the
* insn that would have been unsafe to execute.
@@ -20131,6 +20111,9 @@ process_bpf_exit:
err = update_branch_counts(env, env->cur_state);
if (err)
return err;
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
pop_log);
if (err < 0) {
@@ -20193,8 +20176,11 @@ static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf)
if (env->used_btfs[i].btf == btf)
return i;
- if (env->used_btf_cnt >= MAX_USED_BTFS)
+ if (env->used_btf_cnt >= MAX_USED_BTFS) {
+ verbose(env, "The total number of btfs per program has reached the limit of %u\n",
+ MAX_USED_BTFS);
return -E2BIG;
+ }
btf_get(btf);
@@ -20360,6 +20346,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ if (map->excl_prog_sha &&
+ memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) {
+ verbose(env, "program's hash doesn't match map's excl_prog_hash\n");
+ return -EACCES;
+ }
+
if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
btf_record_has_field(map->record, BPF_RB_ROOT)) {
if (is_tracing_prog_type(prog_type)) {
@@ -20699,12 +20691,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
static void adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_insn_aux_data *new_data,
struct bpf_prog *new_prog, u32 off, u32 cnt)
{
- struct bpf_insn_aux_data *old_data = env->insn_aux_data;
+ struct bpf_insn_aux_data *data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
- u32 old_seen = old_data[off].seen;
+ u32 old_seen = data[off].seen;
u32 prog_len;
int i;
@@ -20712,22 +20703,20 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env,
* (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
* original insn at old prog.
*/
- old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
+ data[off].zext_dst = insn_has_def32(insn + off + cnt - 1);
if (cnt == 1)
return;
prog_len = new_prog->len;
- memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
- memcpy(new_data + off + cnt - 1, old_data + off,
- sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memmove(data + off + cnt - 1, data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1));
for (i = off; i < off + cnt - 1; i++) {
/* Expand insni[off]'s seen count to the patched range. */
- new_data[i].seen = old_seen;
- new_data[i].zext_dst = insn_has_def32(env, insn + i);
+ data[i].seen = old_seen;
+ data[i].zext_dst = insn_has_def32(insn + i);
}
- env->insn_aux_data = new_data;
- vfree(old_data);
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@@ -20765,10 +20754,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
struct bpf_insn_aux_data *new_data = NULL;
if (len > 1) {
- new_data = vzalloc(array_size(env->prog->len + len - 1,
- sizeof(struct bpf_insn_aux_data)));
+ new_data = vrealloc(env->insn_aux_data,
+ array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!new_data)
return NULL;
+
+ env->insn_aux_data = new_data;
}
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
@@ -20777,10 +20770,9 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
- vfree(new_data);
return NULL;
}
- adjust_insn_aux_data(env, new_data, new_prog, off, len);
+ adjust_insn_aux_data(env, new_prog, off, len);
adjust_subprog_starts(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
@@ -21131,7 +21123,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
* BPF_STX + SRC_OP, so it is safe to pass NULL
* here.
*/
- if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
+ if (is_reg64(&insn, load_reg, NULL, DST_OP)) {
if (class == BPF_LD &&
BPF_MODE(code) == BPF_IMM)
i++;
@@ -21400,10 +21392,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
case PTR_TO_ARENA:
if (BPF_MODE(insn->code) == BPF_MEMSX) {
- verbose(env, "sign extending loads from arena are not supported yet\n");
- return -EOPNOTSUPP;
+ if (!bpf_jit_supports_insn(insn, true)) {
+ verbose(env, "sign extending loads from arena are not supported yet\n");
+ return -EOPNOTSUPP;
+ }
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
+ } else {
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
}
- insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
env->prog->aux->num_exentries++;
continue;
default:
@@ -21578,6 +21574,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+ func[i]->aux->main_prog_aux = prog->aux;
for (j = 0; j < prog->aux->size_poke_tab; j++) {
struct bpf_jit_poke_descriptor *poke;
@@ -21608,6 +21605,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) == BPF_PROBE_MEM ||
BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
num_exentries++;
if ((BPF_CLASS(insn->code) == BPF_STX ||
@@ -23855,6 +23853,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
+BTF_ID(func, ___migrate_enable)
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
@@ -24084,67 +24083,6 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr,
return 0;
}
-static bool can_fallthrough(struct bpf_insn *insn)
-{
- u8 class = BPF_CLASS(insn->code);
- u8 opcode = BPF_OP(insn->code);
-
- if (class != BPF_JMP && class != BPF_JMP32)
- return true;
-
- if (opcode == BPF_EXIT || opcode == BPF_JA)
- return false;
-
- return true;
-}
-
-static bool can_jump(struct bpf_insn *insn)
-{
- u8 class = BPF_CLASS(insn->code);
- u8 opcode = BPF_OP(insn->code);
-
- if (class != BPF_JMP && class != BPF_JMP32)
- return false;
-
- switch (opcode) {
- case BPF_JA:
- case BPF_JEQ:
- case BPF_JNE:
- case BPF_JLT:
- case BPF_JLE:
- case BPF_JGT:
- case BPF_JGE:
- case BPF_JSGT:
- case BPF_JSGE:
- case BPF_JSLT:
- case BPF_JSLE:
- case BPF_JCOND:
- case BPF_JSET:
- return true;
- }
-
- return false;
-}
-
-static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
-{
- struct bpf_insn *insn = &prog->insnsi[idx];
- int i = 0, insn_sz;
- u32 dst;
-
- insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
- if (can_fallthrough(insn) && idx + 1 < prog->len)
- succ[i++] = idx + insn_sz;
-
- if (can_jump(insn)) {
- dst = idx + jmp_offset(insn) + 1;
- if (i == 0 || succ[0] != dst)
- succ[i++] = dst;
- }
-
- return i;
-}
-
/* Each field is a register bitmask */
struct insn_live_regs {
u16 use; /* registers read by instruction */
@@ -24342,7 +24280,7 @@ static int compute_live_registers(struct bpf_verifier_env *env)
u16 new_out = 0;
u16 new_in = 0;
- succ_num = insn_successors(env->prog, insn_idx, succ);
+ succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
for (int s = 0; s < succ_num; ++s)
new_out |= state[succ[s]].in;
new_in = (new_out & ~live->def) | live->use;
@@ -24379,9 +24317,6 @@ static int compute_live_registers(struct bpf_verifier_env *env)
out:
kvfree(state);
- kvfree(env->cfg.insn_postorder);
- env->cfg.insn_postorder = NULL;
- env->cfg.cur_postorder = 0;
return err;
}
@@ -24511,7 +24446,7 @@ dfs_continue:
stack[stack_sz++] = w;
}
/* Visit 'w' successors */
- succ_cnt = insn_successors(env->prog, w, succ);
+ succ_cnt = bpf_insn_successors(env->prog, w, succ);
for (j = 0; j < succ_cnt; ++j) {
if (pre[succ[j]]) {
low[w] = min(low[w], low[succ[j]]);
@@ -24684,6 +24619,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
+ ret = compute_postorder(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = bpf_stack_liveness_init(env);
+ if (ret)
+ goto skip_full_check;
+
ret = check_attach_btf_id(env);
if (ret)
goto skip_full_check;
@@ -24833,6 +24776,7 @@ err_unlock:
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
err_free_env:
+ bpf_stack_liveness_free(env);
kvfree(env->cfg.insn_postorder);
kvfree(env->scc_info);
kvfree(env);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index b14e61c64a34..22051b4f1ccb 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-void cgroup_attach_lock(bool lock_threadgroup);
-void cgroup_attach_unlock(bool lock_threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
+ enum cgroup_attach_lock_mode *lock_mode)
__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
__releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 2a4a387f867a..a9e029b570c8 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -10,6 +10,7 @@
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
@@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
cgroup_lock();
- cgroup_attach_lock(true);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- cgroup_attach_unlock(true);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
cgroup_unlock();
return retval;
@@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
cgroup_lock();
- cgroup_attach_lock(true);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
@@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(true);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
cgroup_unlock();
return ret;
}
@@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
struct task_struct *task;
const struct cred *cred, *tcred;
ssize_t ret;
- bool locked;
+ enum cgroup_attach_lock_mode lock_mode;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, ctx->release_agent);
+ strscpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
@@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void)
* Cap @max_active to 1 too.
*/
cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
+ WQ_PERCPU, 1);
BUG_ON(!cgroup_pidlist_destroy_wq);
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb..6ae5f48cf64e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -59,6 +59,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
+#include <linux/nstree.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -124,10 +125,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
*/
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -216,13 +240,22 @@ static u16 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
- .ns.inum = PROC_CGROUP_INIT_INO,
+ .ns.inum = ns_init_inum(&init_cgroup_ns),
.root_cset = &init_css_set,
+ .ns.ns_type = ns_common_type(&init_cgroup_ns),
};
static struct file_system_type cgroup2_fs_type;
@@ -1302,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
{
bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
- /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ /*
+ * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+ * favordynmods can flip while task is between
+ * cgroup_threadgroup_change_begin() and end(), so down_write global
+ * cgroup_threadgroup_rwsem to synchronize them.
+ *
+ * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+ * cgroup_threadgroup_rwsem doesn't exlude tasks between
+ * cgroup_thread_group_change_begin() and end() and thus it's unsafe to
+ * turn off. As the scenario is unlikely, simply disallow disabling once
+ * enabled and print out a warning.
+ */
+ percpu_down_write(&cgroup_threadgroup_rwsem);
if (favor && !favoring) {
+ cgroup_enable_per_threadgroup_rwsem = true;
rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
} else if (!favor && favoring) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
}
static int cgroup_init_root_id(struct cgroup_root *root)
@@ -2459,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
* cgroup_attach_lock - Lock for ->attach()
- * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether acquire and acquire which rwsem
+ * @tsk: thread group to lock
*
* cgroup migration sometimes needs to stabilize threadgroups against forks and
* exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
@@ -2479,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
* Resolve the situation by always acquiring cpus_read_lock() before optionally
* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
* CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead
+ * on dynamic cgroup modifications. see the comment above
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
*/
-void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
cpus_read_lock();
- if (lock_threadgroup)
+
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_down_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
}
/**
* cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether release and release which rwsem
+ * @tsk: thread group to lock
*/
-void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
- if (lock_threadgroup)
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
+
cpus_read_unlock();
}
@@ -2944,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
/* look up all src csets */
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
@@ -2968,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *threadgroup_locked)
+ enum cgroup_attach_lock_mode *lock_mode)
{
struct task_struct *tsk;
pid_t pid;
@@ -2976,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- /*
- * If we migrate a single thread, we don't care about threadgroup
- * stability. If the thread is `current`, it won't exit(2) under our
- * hands or change PID through exec(2). We exclude
- * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
- * callers by cgroup_mutex.
- * Therefore, we can skip the global lock.
- */
- lockdep_assert_held(&cgroup_mutex);
- *threadgroup_locked = pid || threadgroup;
- cgroup_attach_lock(*threadgroup_locked);
-
+retry_find_task:
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
tsk = ERR_PTR(-ESRCH);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
} else {
tsk = current;
@@ -3010,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
-
get_task_struct(tsk);
- goto out_unlock_rcu;
+ rcu_read_unlock();
+
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+ * by cgroup_mutex. Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (pid || threadgroup) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+ else
+ *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ } else {
+ *lock_mode = CGRP_ATTACH_LOCK_NONE;
+ }
+
+ cgroup_attach_lock(*lock_mode, tsk);
+
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * A race with de_thread from another thread's exec()
+ * may strip us of our leadership. If this happens,
+ * throw this task away and try again.
+ */
+ cgroup_attach_unlock(*lock_mode, tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
+ return tsk;
-out_unlock_threadgroup:
- cgroup_attach_unlock(*threadgroup_locked);
- *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
{
- struct cgroup_subsys *ss;
- int ssid;
+ cgroup_attach_unlock(lock_mode, task);
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
-
- cgroup_attach_unlock(threadgroup_locked);
-
- for_each_subsys(ss, ssid)
- if (ss->post_attach)
- ss->post_attach();
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -3088,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ enum cgroup_attach_lock_mode lock_mode;
bool has_tasks;
int ret;
@@ -3119,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
* write-locking can be skipped safely.
*/
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
- cgroup_attach_lock(has_tasks);
+
+ if (has_tasks)
+ lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ else
+ lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+ cgroup_attach_lock(lock_mode, NULL);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3140,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(has_tasks);
+ cgroup_attach_unlock(lock_mode, NULL);
return ret;
}
@@ -3763,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ unsigned int sequence;
+ u64 freeze_time;
+
+ do {
+ sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+ freeze_time = cgrp->freezer.frozen_nsec;
+ /* Add in current freezer interval if the cgroup is freezing. */
+ if (test_bit(CGRP_FREEZE, &cgrp->flags))
+ freeze_time += (ktime_get_ns() -
+ cgrp->freezer.freeze_start_nsec);
+ } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+ do_div(freeze_time, NSEC_PER_USEC);
+ seq_printf(seq, "frozen_usec %llu\n", freeze_time);
+
+ return 0;
+}
+
#ifdef CONFIG_CGROUP_SCHED
/**
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
@@ -4159,6 +4282,7 @@ static void cgroup_file_release(struct kernfs_open_file *of)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
+ of->priv = NULL;
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
@@ -5241,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct task_struct *task;
const struct cred *saved_cred;
ssize_t ret;
- bool threadgroup_locked;
+ enum cgroup_attach_lock_mode lock_mode;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -5273,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, threadgroup_locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -5355,6 +5479,11 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.stat.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_core_local_stat_show,
+ },
+ {
.name = "cgroup.freeze",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_freeze_show,
@@ -5558,7 +5687,7 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -5567,7 +5696,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5701,7 +5830,7 @@ err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@@ -5763,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ seqcount_init(&cgrp->freezer.freeze_seq);
if (cgrp->freezer.e_freeze) {
/*
* Set the CGRP_FREEZE flag, so when a process will be
@@ -5771,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* consider it frozen immediately.
*/
set_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.freeze_start_nsec = ktime_get_ns();
set_bit(CGRP_FROZEN, &cgrp->flags);
}
@@ -5939,7 +6070,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@@ -6312,6 +6443,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
+ ns_tree_add(&init_cgroup_ns);
return 0;
}
@@ -6325,8 +6457,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);
@@ -6343,15 +6481,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
- * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * __cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
* On success return the cgrp or ERR_PTR on failure
- * Only cgroups within current task's cgroup NS are valid.
+ * There are no cgroup NS restrictions.
*/
-struct cgroup *cgroup_get_from_id(u64 id)
+struct cgroup *__cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
- struct cgroup *cgrp, *root_cgrp;
+ struct cgroup *cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
@@ -6373,6 +6511,22 @@ struct cgroup *cgroup_get_from_id(u64 id)
if (!cgrp)
return ERR_PTR(-ENOENT);
+ return cgrp;
+}
+
+/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct cgroup *cgrp, *root_cgrp;
+
+ cgrp = __cgroup_get_from_id(id);
+ if (IS_ERR(cgrp))
+ return cgrp;
root_cgrp = current_cgns_cgroup_dfl();
if (!cgroup_is_descendant(cgrp, root_cgrp)) {
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 383963e28ac6..337608f408ce 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -38,7 +38,6 @@ enum prs_errcode {
/* bits in struct cpuset flags field */
typedef enum {
- CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
+ return css_is_online(&cs->css) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on)
ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);
/*
* cpuset-v1.c
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index b69a7db67090..12e76774c75b 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- cpus_read_lock();
- cpuset_lock();
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- cpuset_unlock();
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval;
}
@@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- cpus_read_lock();
- cpuset_lock();
+ cpuset_full_lock();
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- cpuset_unlock();
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f74d04429a29..52468d2c178a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -40,6 +40,7 @@
#include <linux/sched/isolation.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/task_work.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -131,11 +132,6 @@ static bool force_sd_rebuild;
#define PRS_INVALID_ROOT -1
#define PRS_INVALID_ISOLATED -2
-static inline bool is_prs_invalid(int prs_state)
-{
- return prs_state < 0;
-}
-
/*
* Temporary cpumasks for working with partitions that are passed among
* functions to avoid memory allocation in inner functions.
@@ -159,16 +155,21 @@ void dec_dl_tasks_cs(struct task_struct *p)
cs->nr_deadline_tasks--;
}
-static inline int is_partition_valid(const struct cpuset *cs)
+static inline bool is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
-static inline int is_partition_invalid(const struct cpuset *cs)
+static inline bool is_partition_invalid(const struct cpuset *cs)
{
return cs->partition_root_state < 0;
}
+static inline bool cs_is_member(const struct cpuset *cs)
+{
+ return cs->partition_root_state == PRS_MEMBER;
+}
+
/*
* Callers should hold callback_lock to modify partition_root_state.
*/
@@ -207,7 +208,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
static struct cpuset top_cpuset = {
- .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+ .flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
.relax_domain_level = -1,
@@ -250,6 +251,12 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
+/**
+ * cpuset_lock - Acquire the global cpuset mutex
+ *
+ * This locks the global cpuset mutex to prevent modifications to cpuset
+ * hierarchy and configurations. This helper is not enough to make modification.
+ */
void cpuset_lock(void)
{
mutex_lock(&cpuset_mutex);
@@ -260,6 +267,24 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+/**
+ * cpuset_full_lock - Acquire full protection for cpuset modification
+ *
+ * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex
+ * to safely modify cpuset data.
+ */
+void cpuset_full_lock(void)
+{
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+}
+
+void cpuset_full_unlock(void)
+{
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+}
+
static DEFINE_SPINLOCK(callback_lock);
void cpuset_callback_lock_irq(void)
@@ -280,7 +305,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
{
if (!cpusets_insane_config() &&
movable_only_nodes(nodes)) {
- static_branch_enable(&cpusets_insane_config_key);
+ static_branch_enable_cpuslocked(&cpusets_insane_config_key);
pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
"Cpuset allocations might fail even with a lot of memory available.\n",
nodemask_pr_args(nodes));
@@ -411,94 +436,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
}
/**
- * alloc_cpumasks - allocate three cpumasks for cpuset
- * @cs: the cpuset that have cpumasks to be allocated.
- * @tmp: the tmpmasks structure pointer
+ * alloc_cpumasks - Allocate an array of cpumask variables
+ * @pmasks: Pointer to array of cpumask_var_t pointers
+ * @size: Number of cpumasks to allocate
* Return: 0 if successful, -ENOMEM otherwise.
*
- * Only one of the two input arguments should be non-NULL.
+ * Allocates @size cpumasks and initializes them to empty. Returns 0 on
+ * success, -ENOMEM on allocation failure. On failure, any previously
+ * allocated cpumasks are freed.
*/
-static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
{
- cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
+ int i;
- if (cs) {
- pmask1 = &cs->cpus_allowed;
- pmask2 = &cs->effective_cpus;
- pmask3 = &cs->effective_xcpus;
- pmask4 = &cs->exclusive_cpus;
- } else {
- pmask1 = &tmp->new_cpus;
- pmask2 = &tmp->addmask;
- pmask3 = &tmp->delmask;
- pmask4 = NULL;
+ for (i = 0; i < size; i++) {
+ if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {
+ while (--i >= 0)
+ free_cpumask_var(*pmasks[i]);
+ return -ENOMEM;
+ }
}
-
- if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
- return -ENOMEM;
-
- if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
- goto free_one;
-
- if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
- goto free_two;
-
- if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
- goto free_three;
-
-
return 0;
+}
+
+/**
+ * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.
+ * @tmp: Pointer to tmpmasks structure to populate
+ * Return: 0 on success, -ENOMEM on allocation failure
+ */
+static inline int alloc_tmpmasks(struct tmpmasks *tmp)
+{
+ /*
+ * Array of pointers to the three cpumask_var_t fields in tmpmasks.
+ * Note: Array size must match actual number of masks (3)
+ */
+ cpumask_var_t *pmask[3] = {
+ &tmp->new_cpus,
+ &tmp->addmask,
+ &tmp->delmask
+ };
-free_three:
- free_cpumask_var(*pmask3);
-free_two:
- free_cpumask_var(*pmask2);
-free_one:
- free_cpumask_var(*pmask1);
- return -ENOMEM;
+ return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
}
/**
- * free_cpumasks - free cpumasks in a tmpmasks structure
- * @cs: the cpuset that have cpumasks to be free.
+ * free_tmpmasks - free cpumasks in a tmpmasks structure
* @tmp: the tmpmasks structure pointer
*/
-static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline void free_tmpmasks(struct tmpmasks *tmp)
{
- if (cs) {
- free_cpumask_var(cs->cpus_allowed);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->effective_xcpus);
- free_cpumask_var(cs->exclusive_cpus);
- }
- if (tmp) {
- free_cpumask_var(tmp->new_cpus);
- free_cpumask_var(tmp->addmask);
- free_cpumask_var(tmp->delmask);
- }
+ if (!tmp)
+ return;
+
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
}
/**
- * alloc_trial_cpuset - allocate a trial cpuset
- * @cs: the cpuset that the trial cpuset duplicates
+ * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset
+ * @cs: Source cpuset to duplicate (NULL for a fresh allocation)
+ *
+ * Creates a new cpuset by either:
+ * 1. Duplicating an existing cpuset (if @cs is non-NULL), or
+ * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)
+ *
+ * Return: Pointer to newly allocated cpuset on success, NULL on failure
*/
-static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
+static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
- trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+ /* Allocate base structure */
+ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
+ kzalloc(sizeof(*cs), GFP_KERNEL);
if (!trial)
return NULL;
- if (alloc_cpumasks(trial, NULL)) {
+ /* Setup cpumask pointer array */
+ cpumask_var_t *pmask[4] = {
+ &trial->cpus_allowed,
+ &trial->effective_cpus,
+ &trial->effective_xcpus,
+ &trial->exclusive_cpus
+ };
+
+ if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
kfree(trial);
return NULL;
}
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
- cpumask_copy(trial->effective_cpus, cs->effective_cpus);
- cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
- cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ /* Copy masks if duplicating */
+ if (cs) {
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+ cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ }
+
return trial;
}
@@ -508,7 +543,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumasks(cs, NULL);
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->effective_xcpus);
+ free_cpumask_var(cs->exclusive_cpus);
kfree(cs);
}
@@ -540,6 +578,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
return true;
}
+/**
+ * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * Conflict detection rules:
+ * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
+ * 2. exclusive_cpus masks cannot intersect between cpusets
+ * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ */
+static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ /* If either cpuset is exclusive, check if they are mutually exclusive */
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return !cpusets_are_exclusive(cs1, cs2);
+
+ /* Exclusive_cpus cannot intersect */
+ if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
+ return true;
+
+ /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
+ if (!cpumask_empty(cs1->cpus_allowed) &&
+ cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
+ return true;
+
+ if (!cpumask_empty(cs2->cpus_allowed) &&
+ cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ return true;
+
+ return false;
+}
+
+static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
+ return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+ return false;
+}
+
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -621,38 +700,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
*/
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
- bool txset, cxset; /* Are exclusive_cpus set? */
-
if (c == cur)
continue;
-
- txset = !cpumask_empty(trial->exclusive_cpus);
- cxset = !cpumask_empty(c->exclusive_cpus);
- if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
- (txset && cxset)) {
- if (!cpusets_are_exclusive(trial, c))
- goto out;
- } else if (txset || cxset) {
- struct cpumask *xcpus, *acpus;
-
- /*
- * When just one of the exclusive_cpus's is set,
- * cpus_allowed of the other cpuset, if set, cannot be
- * a subset of it or none of those CPUs will be
- * available if these exclusive CPUs are activated.
- */
- if (txset) {
- xcpus = trial->exclusive_cpus;
- acpus = c->cpus_allowed;
- } else {
- xcpus = c->exclusive_cpus;
- acpus = trial->cpus_allowed;
- }
- if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
- goto out;
- }
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ if (cpus_excl_conflict(trial, c))
+ goto out;
+ if (mems_excl_conflict(trial, c))
goto out;
}
@@ -1363,38 +1415,25 @@ bool cpuset_cpu_is_isolated(int cpu)
}
EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
-/*
- * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
- * @cs: cpuset
- * @xcpus: effective exclusive CPUs value to be set
- * @real_cs: the real cpuset (can be NULL)
- * Return: 0 if there is no sibling conflict, > 0 otherwise
+/**
+ * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
+ * @parent: Parent cpuset containing all siblings
+ * @cs: Current cpuset (will be skipped)
+ * @excpus: exclusive effective CPU mask to modify
*
- * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to
- * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus
- * as well. The provision of real_cs means that a cpumask is being changed and
- * the given cs is a trial one.
+ * This function ensures the given @excpus mask doesn't include any CPUs that
+ * are exclusively allocated to sibling cpusets. It walks through all siblings
+ * of @cs under @parent and removes their exclusive CPUs from @excpus.
*/
-static int compute_effective_exclusive_cpumask(struct cpuset *cs,
- struct cpumask *xcpus,
- struct cpuset *real_cs)
+static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *excpus)
{
struct cgroup_subsys_state *css;
- struct cpuset *parent = parent_cs(cs);
struct cpuset *sibling;
int retval = 0;
- if (!xcpus)
- xcpus = cs->effective_xcpus;
-
- cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
-
- if (!real_cs) {
- if (!cpumask_empty(cs->exclusive_cpus))
- return 0;
- } else {
- cs = real_cs;
- }
+ if (cpumask_empty(excpus))
+ return retval;
/*
* Exclude exclusive CPUs from siblings
@@ -1404,20 +1443,66 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs,
if (sibling == cs)
continue;
- if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
- cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
+ if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
+ cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
retval++;
continue;
}
- if (cpumask_intersects(xcpus, sibling->effective_xcpus)) {
- cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
+ if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
retval++;
}
}
rcu_read_unlock();
+
return retval;
}
+/*
+ * compute_excpus - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
+ *
+ * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets
+ * and exclude their exclusive_cpus or effective_xcpus as well.
+ */
+static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
+
+ if (!cpumask_empty(cs->exclusive_cpus))
+ return 0;
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+/*
+ * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset
+ * @trialcs: The trial cpuset containing the proposed new configuration
+ * @cs: The original cpuset that the trial configuration is based on
+ * Return: 0 if successful with no sibling conflict, >0 if a conflict is found
+ *
+ * Computes the effective_xcpus for a trial configuration. @cs is provided to represent
+ * the real cs.
+ */
+static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(trialcs);
+ struct cpumask *excpus = trialcs->effective_xcpus;
+
+ /* trialcs is member, cpuset.cpus has no impact to excpus */
+ if (cs_is_member(cs))
+ cpumask_and(excpus, trialcs->exclusive_cpus,
+ parent->effective_xcpus);
+ else
+ cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
static inline bool is_remote_partition(struct cpuset *cs)
{
return !list_empty(&cs->remote_sibling);
@@ -1459,7 +1544,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* Note that creating a remote partition with any local partition root
* above it or remote partition root underneath it is not allowed.
*/
- compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
+ compute_excpus(cs, tmp->new_cpus);
WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
@@ -1508,7 +1593,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
cs->partition_root_state = PRS_MEMBER;
/* effective_xcpus may need to be changed */
- compute_effective_exclusive_cpumask(cs, NULL, NULL);
+ compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
@@ -1677,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_invalidate) {
- if (is_prs_invalid(old_prs))
+ if (is_partition_invalid(cs))
return 0;
/*
@@ -1709,13 +1794,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
- * Need to call compute_effective_exclusive_cpumask() in case
+ * Need to call compute_excpus() in case
* exclusive_cpus not set. Sibling conflict should only happen
* if exclusive_cpus isn't set.
*/
xcpus = tmp->delmask;
- if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
+ if (compute_excpus(cs, xcpus))
WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
/*
* Enabling partition root is not allowed if its
@@ -1727,11 +1813,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (prstate_housekeeping_conflict(new_prs, xcpus))
return PERR_HKEEPING;
- /*
- * A parent can be left with no CPU as long as there is no
- * task directly associated with the parent partition.
- */
- if (nocpu)
+ if (tasks_nocpu_error(parent, cs, xcpus))
return PERR_NOCPUS;
/*
@@ -1748,7 +1830,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
deleting = true;
subparts_delta++;
- new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus back to parent's effective_cpus
@@ -1788,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* For invalid partition:
* delmask = newmask & parent->effective_xcpus
*/
- if (is_prs_invalid(old_prs)) {
+ if (is_partition_invalid(cs)) {
adding = false;
deleting = cpumask_and(tmp->delmask,
newmask, parent->effective_xcpus);
@@ -1837,13 +1918,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* A partition error happens when parent has tasks and all
* its effective CPUs will have to be distributed out.
*/
- WARN_ON_ONCE(!is_partition_valid(parent));
if (nocpu) {
part_error = PERR_NOCPUS;
if (is_partition_valid(cs))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
- } else if (is_partition_invalid(cs) &&
+ } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
cpumask_subset(xcpus, parent->effective_xcpus)) {
struct cgroup_subsys_state *css;
struct cpuset *child;
@@ -1996,7 +2076,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* 2) All the effective_cpus will be used up and cp
* has tasks
*/
- compute_effective_exclusive_cpumask(cs, new_ecpus, NULL);
+ compute_excpus(cs, new_ecpus);
cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
rcu_read_lock();
@@ -2075,7 +2155,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* its value is being processed.
*/
if (remote && (cp != cs)) {
- compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL);
+ compute_excpus(cp, tmp->new_cpus);
if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
@@ -2177,7 +2257,7 @@ get_css:
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_effective_exclusive_cpumask(cp, NULL, NULL);
+ compute_excpus(cp, cp->effective_xcpus);
/*
* Make sure effective_xcpus is properly set for a valid
@@ -2284,82 +2364,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
rcu_read_unlock();
}
-/**
- * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
- * @cs: the cpuset to consider
- * @trialcs: trial cpuset
- * @buf: buffer of cpu numbers written to this cpuset
- */
-static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
- const char *buf)
+static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)
{
int retval;
- struct tmpmasks tmp;
- struct cpuset *parent = parent_cs(cs);
- bool invalidate = false;
- bool force = false;
- int old_prs = cs->partition_root_state;
- /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */
- if (cs == &top_cpuset)
- return -EACCES;
+ retval = cpulist_parse(buf, out_mask);
+ if (retval < 0)
+ return retval;
+ if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))
+ return -EINVAL;
- /*
- * An empty cpus_allowed is ok only if the cpuset has no tasks.
- * Since cpulist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have cpus.
- */
- if (!*buf) {
- cpumask_clear(trialcs->cpus_allowed);
- if (cpumask_empty(trialcs->exclusive_cpus))
- cpumask_clear(trialcs->effective_xcpus);
- } else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
- if (retval < 0)
- return retval;
+ return 0;
+}
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
- return -EINVAL;
+/**
+ * validate_partition - Validate a cpuset partition configuration
+ * @cs: The cpuset to validate
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ *
+ * If any validation check fails, the appropriate error code is set in the
+ * cpuset's prs_err field.
+ *
+ * Return: PRS error code (0 if valid, non-zero error code if invalid)
+ */
+static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)
+{
+ struct cpuset *parent = parent_cs(cs);
- /*
- * When exclusive_cpus isn't explicitly set, it is constrained
- * by cpus_allowed and parent's effective_xcpus. Otherwise,
- * trialcs->effective_xcpus is used as a temporary cpumask
- * for checking validity of the partition root.
- */
- trialcs->partition_root_state = PRS_MEMBER;
- if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
- compute_effective_exclusive_cpumask(trialcs, NULL, cs);
- }
+ if (cs_is_member(trialcs))
+ return PERR_NONE;
- /* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
- return 0;
+ if (cpumask_empty(trialcs->effective_xcpus))
+ return PERR_INVCPUS;
- if (alloc_cpumasks(NULL, &tmp))
- return -ENOMEM;
+ if (prstate_housekeeping_conflict(trialcs->partition_root_state,
+ trialcs->effective_xcpus))
+ return PERR_HKEEPING;
- if (old_prs) {
- if (is_partition_valid(cs) &&
- cpumask_empty(trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_INVCPUS;
- } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_HKEEPING;
- } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_NOCPUS;
- }
- }
+ if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))
+ return PERR_NOCPUS;
- /*
- * Check all the descendants in update_cpumasks_hier() if
- * effective_xcpus is to be changed.
- */
- force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+ return PERR_NONE;
+}
+
+static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ int retval;
+ struct cpuset *parent = parent_cs(cs);
retval = validate_change(cs, trialcs);
@@ -2374,7 +2426,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* partition. However, any conflicting sibling partitions
* have to be marked as invalid too.
*/
- invalidate = true;
+ trialcs->prs_err = PERR_NOTEXCL;
rcu_read_lock();
cpuset_for_each_child(cp, css, parent) {
struct cpumask *xcpus = user_xcpus(trialcs);
@@ -2382,36 +2434,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (is_partition_valid(cp) &&
cpumask_intersects(xcpus, cp->effective_xcpus)) {
rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
rcu_read_lock();
}
}
rcu_read_unlock();
retval = 0;
}
+ return retval;
+}
- if (retval < 0)
- goto out_free;
+/**
+ * partition_cpus_change - Handle partition state changes due to CPU mask updates
+ * @cs: The target cpuset being modified
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ * @tmp: Temporary masks for intermediate calculations
+ *
+ * This function handles partition state transitions triggered by CPU mask changes.
+ * CPU modifications may cause a partition to be disabled or require state updates.
+ */
+static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ enum prs_errcode prs_err;
- if (is_partition_valid(cs) ||
- (is_partition_invalid(cs) && !invalidate)) {
- struct cpumask *xcpus = trialcs->effective_xcpus;
+ if (cs_is_member(cs))
+ return;
- if (cpumask_empty(xcpus) && is_partition_invalid(cs))
- xcpus = trialcs->cpus_allowed;
+ prs_err = validate_partition(cs, trialcs);
+ if (prs_err)
+ trialcs->prs_err = cs->prs_err = prs_err;
- /*
- * Call remote_cpus_update() to handle valid remote partition
- */
- if (is_remote_partition(cs))
- remote_cpus_update(cs, NULL, xcpus, &tmp);
- else if (invalidate)
+ if (is_remote_partition(cs)) {
+ if (trialcs->prs_err)
+ remote_partition_disable(cs, tmp);
+ else
+ remote_cpus_update(cs, trialcs->exclusive_cpus,
+ trialcs->effective_xcpus, tmp);
+ } else {
+ if (trialcs->prs_err)
update_parent_effective_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
+ NULL, tmp);
else
update_parent_effective_cpumask(cs, partcmd_update,
- xcpus, &tmp);
+ trialcs->effective_xcpus, tmp);
}
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
+ if (retval < 0)
+ return retval;
+
+ /* Nothing to do if the cpus didn't change */
+ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ return 0;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
+
+ compute_trialcs_excpus(trialcs, cs);
+ trialcs->prs_err = PERR_NONE;
+
+ retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ if (retval < 0)
+ goto out_free;
+
+ /*
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ partition_cpus_change(cs, trialcs, &tmp);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
@@ -2427,7 +2535,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
out_free:
- free_cpumasks(NULL, &tmp);
+ free_tmpmasks(&tmp);
return retval;
}
@@ -2444,33 +2552,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
- struct cpuset *parent = parent_cs(cs);
- bool invalidate = false;
bool force = false;
int old_prs = cs->partition_root_state;
- if (!*buf) {
- cpumask_clear(trialcs->exclusive_cpus);
- cpumask_clear(trialcs->effective_xcpus);
- } else {
- retval = cpulist_parse(buf, trialcs->exclusive_cpus);
- if (retval < 0)
- return retval;
- }
+ retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
+ if (retval < 0)
+ return retval;
/* Nothing to do if the CPUs didn't change */
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (*buf) {
- trialcs->partition_root_state = PRS_MEMBER;
- /*
- * Reject the change if there is exclusive CPUs conflict with
- * the siblings.
- */
- if (compute_effective_exclusive_cpumask(trialcs, NULL, cs))
- return -EINVAL;
- }
+ /*
+ * Reject the change if there is exclusive CPUs conflict with
+ * the siblings.
+ */
+ if (compute_trialcs_excpus(trialcs, cs))
+ return -EINVAL;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2482,35 +2580,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval)
return retval;
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_tmpmasks(&tmp))
return -ENOMEM;
- if (old_prs) {
- if (cpumask_empty(trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_INVCPUS;
- } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_HKEEPING;
- } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_NOCPUS;
- }
+ trialcs->prs_err = PERR_NONE;
+ partition_cpus_change(cs, trialcs, &tmp);
- if (is_remote_partition(cs)) {
- if (invalidate)
- remote_partition_disable(cs, &tmp);
- else
- remote_cpus_update(cs, trialcs->exclusive_cpus,
- trialcs->effective_xcpus, &tmp);
- } else if (invalidate) {
- update_parent_effective_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
- } else {
- update_parent_effective_cpumask(cs, partcmd_update,
- trialcs->effective_xcpus, &tmp);
- }
- }
spin_lock_irq(&callback_lock);
cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
@@ -2530,7 +2605,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
- free_cpumasks(NULL, &tmp);
+ free_tmpmasks(&tmp);
return 0;
}
@@ -2582,9 +2657,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-static void cpuset_post_attach(void)
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
{
flush_workqueue(cpuset_migrate_mm_wq);
+ kfree(head);
+}
+
+static void schedule_flush_migrate_mm(void)
+{
+ struct callback_head *flush_cb;
+
+ flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+ if (!flush_cb)
+ return;
+
+ init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+ if (task_work_add(current, flush_cb, TWA_RESUME))
+ kfree(flush_cb);
}
/*
@@ -2750,32 +2840,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
- * it's read-only
- */
- if (cs == &top_cpuset) {
- retval = -EACCES;
- goto done;
- }
-
- /*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
- * Since nodelist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have memory.
+ * The validate_change() call ensures that cpusets with tasks have memory.
*/
- if (!*buf) {
- nodes_clear(trialcs->mems_allowed);
- } else {
- retval = nodelist_parse(buf, trialcs->mems_allowed);
- if (retval < 0)
- goto done;
+ retval = nodelist_parse(buf, trialcs->mems_allowed);
+ if (retval < 0)
+ goto done;
- if (!nodes_subset(trialcs->mems_allowed,
- top_cpuset.mems_allowed)) {
- retval = -EINVAL;
- goto done;
- }
+ if (!nodes_subset(trialcs->mems_allowed,
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
+ goto done;
}
if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
@@ -2826,7 +2901,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int spread_flag_changed;
int err;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs)
return -ENOMEM;
@@ -2884,10 +2959,10 @@ static int update_prstate(struct cpuset *cs, int new_prs)
/*
* Treat a previously invalid partition root as if it is a "member".
*/
- if (new_prs && is_prs_invalid(old_prs))
+ if (new_prs && is_partition_invalid(cs))
old_prs = PRS_MEMBER;
- if (alloc_cpumasks(NULL, &tmpmask))
+ if (alloc_tmpmasks(&tmpmask))
return -ENOMEM;
err = update_partition_exclusive_flag(cs, new_prs);
@@ -2983,7 +3058,7 @@ out:
notify_partition_change(cs, old_prs);
if (force_sd_rebuild)
rebuild_sched_domains_locked();
- free_cpumasks(NULL, &tmpmask);
+ free_tmpmasks(&tmpmask);
return 0;
}
@@ -3141,6 +3216,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
bool cpus_updated, mems_updated;
+ bool queue_task_work = false;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
@@ -3191,15 +3267,18 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs))
+ if (is_memory_migrate(cs)) {
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- else
+ queue_task_work = true;
+ } else
mmput(mm);
}
}
out:
+ if (queue_task_work)
+ schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
if (cs->nr_migrate_dl_tasks) {
@@ -3223,13 +3302,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
struct cpuset *trialcs;
int retval = -ENODEV;
+ /* root is read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
buf = strstrip(buf);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
goto out_unlock;
@@ -3254,9 +3336,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
if (force_sd_rebuild)
rebuild_sched_domains_locked();
out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
- flush_workqueue(cpuset_migrate_mm_wq);
+ cpuset_full_unlock();
+ if (of_cft(of)->private == FILE_MEMLIST)
+ schedule_flush_migrate_mm();
return retval ?: nbytes;
}
@@ -3358,14 +3440,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
else
return -EINVAL;
- css_get(&cs->css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
- css_put(&cs->css);
+ cpuset_full_unlock();
return retval ?: nbytes;
}
@@ -3464,15 +3542,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
if (!parent_css)
return &top_cpuset.css;
- cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ cs = dup_or_alloc_cpuset(NULL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (alloc_cpumasks(cs, NULL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
-
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
@@ -3495,10 +3568,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
- set_bit(CS_ONLINE, &cs->flags);
+ cpuset_full_lock();
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
@@ -3550,8 +3620,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
return 0;
}
@@ -3566,17 +3635,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
+ cpuset_full_lock();
if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
- clear_bit(CS_ONLINE, &cs->flags);
-
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
}
/*
@@ -3588,16 +3652,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
+ cpuset_full_lock();
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
-
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
-
+ cpuset_full_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3726,7 +3785,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.can_fork = cpuset_can_fork,
.cancel_fork = cpuset_cancel_fork,
@@ -3870,9 +3928,10 @@ retry:
partcmd = partcmd_invalidate;
/*
* On the other hand, an invalid partition root may be transitioned
- * back to a regular one.
+ * back to a regular one with a non-empty effective xcpus.
*/
- else if (is_partition_valid(parent) && is_partition_invalid(cs))
+ else if (is_partition_valid(parent) && is_partition_invalid(cs) &&
+ !cpumask_empty(cs->effective_xcpus))
partcmd = partcmd_update;
if (partcmd >= 0) {
@@ -3929,7 +3988,7 @@ static void cpuset_handle_hotplug(void)
bool on_dfl = is_in_v2_mode();
struct tmpmasks tmp, *ptmp = NULL;
- if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ if (on_dfl && !alloc_tmpmasks(&tmp))
ptmp = &tmp;
lockdep_assert_cpus_held();
@@ -4009,7 +4068,7 @@ static void cpuset_handle_hotplug(void)
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
- free_cpumasks(NULL, ptmp);
+ free_tmpmasks(ptmp);
}
void cpuset_update_active_cpus(void)
@@ -4074,7 +4133,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = task_cs(tsk);
if (cs != &top_cpuset)
@@ -4096,7 +4154,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
cpumask_copy(pmask, possible_mask);
}
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -4169,9 +4226,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return mask;
@@ -4266,10 +4321,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 80aa3f027ac3..81ea38dd6f9d 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
return -ENODEV;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
@@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
css, css->id);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
cgroup_kn_unlock(of->kn);
return 0;
@@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
return -ENOMEM;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name_buf);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
kfree(name_buf);
return 0;
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index bf1690a167dd..6c18854bff34 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze)
/*
* Freeze or unfreeze all tasks in the given cgroup.
*/
-static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec)
{
struct css_task_iter it;
struct task_struct *task;
@@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- if (freeze)
+ write_seqcount_begin(&cgrp->freezer.freeze_seq);
+ if (freeze) {
set_bit(CGRP_FREEZE, &cgrp->flags);
- else
+ cgrp->freezer.freeze_start_nsec = ts_nsec;
+ } else {
clear_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.frozen_nsec += (ts_nsec -
+ cgrp->freezer.freeze_start_nsec);
+ }
+ write_seqcount_end(&cgrp->freezer.freeze_seq);
spin_unlock_irq(&css_set_lock);
if (freeze)
@@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
struct cgroup *parent;
struct cgroup *dsct;
bool applied = false;
+ u64 ts_nsec;
bool old_e;
lockdep_assert_held(&cgroup_mutex);
@@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
return;
cgrp->freezer.freeze = freeze;
+ ts_nsec = ktime_get_ns();
/*
* Propagate changes downwards the cgroup tree.
@@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
/*
* Do change actual state: freeze or unfreeze.
*/
- cgroup_do_freeze(dsct, freeze);
+ cgroup_do_freeze(dsct, freeze, ts_nsec);
applied = true;
}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 144a464e45c6..fdbe57578e68 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -5,7 +5,7 @@
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
-
+#include <linux/nstree.h>
/* cgroup namespaces */
@@ -21,33 +21,32 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts)
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
- struct cgroup_namespace *new_ns;
+ struct cgroup_namespace *new_ns __free(kfree) = NULL;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
+ ret = ns_common_init(new_ns);
+ if (ret)
return ERR_PTR(ret);
- }
- refcount_set(&new_ns->ns.count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
+ ns_tree_add(new_ns);
+ return no_free_ptr(new_ns);
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
+ ns_tree_remove(ns);
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
EXPORT_SYMBOL(free_cgroup_ns);
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+struct cgroup_namespace *copy_cgroup_ns(u64 flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
@@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
return new_ns;
}
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns)
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
- .type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 981e2f77ad4e..a198e40c799b 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -479,6 +479,9 @@ void css_rstat_exit(struct cgroup_subsys_state *css)
if (!css_uses_rstat(css))
return;
+ if (!css->rstat_cpu)
+ return;
+
css_rstat_flush(css);
/* sanity check */
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index 64caaf997fc0..7c3924614e01 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -93,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y
# Provides some protections against SYN flooding.
CONFIG_SYN_COOKIES=y
-# Enable Kernel Control Flow Integrity (currently Clang only).
-CONFIG_CFI_CLANG=y
+# Enable Kernel Control Flow Integrity.
+CONFIG_CFI=y
# CONFIG_CFI_PERMISSIVE is not set
# Attack surface reduction: do not autoload TTY line disciplines.
diff --git a/kernel/cred.c b/kernel/cred.c
index 9676965c0981..dbf6b687dc5c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void)
* The new process gets the current process's subjective credentials as its
* objective and subjective credentials
*/
-int copy_creds(struct task_struct *p, unsigned long clone_flags)
+int copy_creds(struct task_struct *p, u64 clone_flags)
{
struct cred *new;
int ret;
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 67af8a55185d..d9b9dcba6ff7 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -483,8 +483,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
pr_err("Reserved memory: unable to setup CMA region\n");
return err;
}
- /* Architecture specific contiguous memory fixup. */
- dma_contiguous_early_fixup(rmem->base, rmem->size);
if (default_cma)
dma_contiguous_default_area = cma;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index e43c6de2bce4..b82399437db0 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -39,6 +39,7 @@ enum {
dma_debug_sg,
dma_debug_coherent,
dma_debug_resource,
+ dma_debug_noncoherent,
};
enum map_err_types {
@@ -141,6 +142,7 @@ static const char *type2name[] = {
[dma_debug_sg] = "scatter-gather",
[dma_debug_coherent] = "coherent",
[dma_debug_resource] = "resource",
+ [dma_debug_noncoherent] = "noncoherent",
};
static const char *dir2name[] = {
@@ -993,7 +995,8 @@ static void check_unmap(struct dma_debug_entry *ref)
"[mapped as %s] [unmapped as %s]\n",
ref->dev_addr, ref->size,
type2name[entry->type], type2name[ref->type]);
- } else if (entry->type == dma_debug_coherent &&
+ } else if ((entry->type == dma_debug_coherent ||
+ entry->type == dma_debug_noncoherent) &&
ref->paddr != entry->paddr) {
err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different CPU address "
@@ -1581,6 +1584,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
}
}
+void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_noncoherent;
+ entry->dev = dev;
+ entry->paddr = page_to_phys(page);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = direction;
+
+ add_dma_entry(entry, attrs);
+}
+
+void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_noncoherent,
+ .dev = dev,
+ .paddr = page_to_phys(page),
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+
static int __init dma_debug_driver_setup(char *str)
{
int i;
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index f525197d3cae..48757ca13f31 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev,
extern void debug_dma_sync_sg_for_device(struct device *dev,
struct scatterlist *sg,
int nelems, int direction);
+extern void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs);
+extern void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr);
#else /* CONFIG_DMA_API_DEBUG */
static inline void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
@@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev,
int nelems, int direction)
{
}
+
+static inline void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+}
#endif /* CONFIG_DMA_API_DEBUG */
#endif /* _KERNEL_DMA_DEBUG_H */
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 107e4a4d251d..56de28a3b179 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -712,7 +712,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
if (page) {
trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle,
size, dir, gfp, 0);
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
+ debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0);
} else {
trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0);
}
@@ -738,7 +738,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page,
dma_addr_t dma_handle, enum dma_data_direction dir)
{
trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0);
- debug_dma_unmap_page(dev, dma_handle, size, dir);
+ debug_dma_free_pages(dev, page, size, dir, dma_handle);
__dma_free_pages(dev, size, page, dma_handle, dir);
}
EXPORT_SYMBOL_GPL(dma_free_pages);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 7b04f7575796..ee45dee33d49 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -102,8 +102,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
#ifdef CONFIG_DMA_DIRECT_REMAP
addr = dma_common_contiguous_remap(page, pool_size,
- pgprot_dmacoherent(PAGE_KERNEL),
- __builtin_return_address(0));
+ pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)),
+ __builtin_return_address(0));
if (!addr)
goto free_page;
#else
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 408d28b5179d..f62e1d1b2063 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -143,6 +143,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
return ret;
}
+/**
+ * arch_irqentry_exit_need_resched - Architecture specific need resched function
+ *
+ * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed.
+ * Defaults return true.
+ *
+ * The main purpose is to permit arch to avoid preemption of a task from an IRQ.
+ */
+static inline bool arch_irqentry_exit_need_resched(void);
+
+#ifndef arch_irqentry_exit_need_resched
+static inline bool arch_irqentry_exit_need_resched(void) { return true; }
+#endif
+
void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
@@ -150,7 +164,7 @@ void raw_irqentry_exit_cond_resched(void)
rcu_irq_exit_check_preempt();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
WARN_ON_ONCE(!on_thread_stack());
- if (need_resched())
+ if (need_resched() && arch_irqentry_exit_need_resched())
preempt_schedule_irq();
}
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6c83ad674d01..808c0d7a31fa 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
}
struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
int rctx, start_entry_idx;
+ /* crosstask is not supported for user stacks */
+ if (crosstask && user && !kernel)
+ return NULL;
+
entry = get_callchain_entry(&rctx);
if (!entry)
return NULL;
- ctx.entry = entry;
- ctx.max_stack = max_stack;
- ctx.nr = entry->nr = init_nr;
- ctx.contexts = 0;
- ctx.contexts_maxed = false;
+ ctx.entry = entry;
+ ctx.max_stack = max_stack;
+ ctx.nr = entry->nr = 0;
+ ctx.contexts = 0;
+ ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
@@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
perf_callchain_kernel(&ctx, regs);
}
- if (user) {
+ if (user && !crosstask) {
if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
-
- if (regs) {
- if (crosstask)
+ if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
goto exit_put;
+ regs = task_pt_regs(current);
+ }
- if (add_mark)
- perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+ if (add_mark)
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- start_entry_idx = entry->nr;
- perf_callchain_user(&ctx, regs);
- fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
- }
+ start_entry_idx = entry->nr;
+ perf_callchain_user(&ctx, regs);
+ fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8060c2857bb2..7541f6f85fcb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2665,6 +2665,9 @@ static void perf_log_itrace_start(struct perf_event *event);
static void perf_event_unthrottle(struct perf_event *event, bool start)
{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
event->hw.interrupts = 0;
if (start)
event->pmu->start(event, 0);
@@ -2674,6 +2677,9 @@ static void perf_event_unthrottle(struct perf_event *event, bool start)
static void perf_event_throttle(struct perf_event *event)
{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
event->hw.interrupts = MAX_INTERRUPTS;
event->pmu->stop(event, 0);
if (event == event->group_leader)
@@ -3968,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
*/
static inline bool event_update_userpage(struct perf_event *event)
{
- if (likely(!atomic_read(&event->mmap_count)))
+ if (likely(!refcount_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
@@ -6704,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
mapped_f mapped = get_mapped(event, event_mapped);
- atomic_inc(&event->mmap_count);
- atomic_inc(&event->rb->mmap_count);
+ refcount_inc(&event->mmap_count);
+ refcount_inc(&event->rb->mmap_count);
if (vma->vm_pgoff)
- atomic_inc(&event->rb->aux_mmap_count);
+ refcount_inc(&event->rb->aux_mmap_count);
if (mapped)
mapped(event, vma->vm_mm);
@@ -6743,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
* to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
+ refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@@ -6763,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&rb->aux_mutex);
}
- if (atomic_dec_and_test(&rb->mmap_count))
+ if (refcount_dec_and_test(&rb->mmap_count))
detach_rest = true;
- if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
ring_buffer_attach(event, NULL);
@@ -6927,230 +6933,242 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
return err;
}
-static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
{
- struct perf_event *event = file->private_data;
- unsigned long user_locked, user_lock_limit;
+ unsigned long user_locked, user_lock_limit, locked, lock_limit;
struct user_struct *user = current_user();
- struct mutex *aux_mutex = NULL;
- struct perf_buffer *rb = NULL;
- unsigned long locked, lock_limit;
- unsigned long vma_size;
- unsigned long nr_pages;
- long user_extra = 0, extra = 0;
- int ret, flags = 0;
- mapped_f mapped;
+
+ user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ /* Increase the limit linearly with more CPUs */
+ user_lock_limit *= num_online_cpus();
+
+ user_locked = atomic_long_read(&user->locked_vm);
/*
- * Don't allow mmap() of inherited per-task counters. This would
- * create a performance issue due to all children writing to the
- * same rb.
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
*/
- if (event->cpu == -1 && event->attr.inherit)
- return -EINVAL;
+ if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += *user_extra;
- if (!(vma->vm_flags & VM_SHARED))
- return -EINVAL;
+ if (user_locked > user_lock_limit) {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
+ *extra = user_locked - user_lock_limit;
+ *user_extra -= *extra;
+ }
- ret = security_perf_event_read(event);
- if (ret)
- return ret;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
- vma_size = vma->vm_end - vma->vm_start;
- nr_pages = vma_size / PAGE_SIZE;
+ return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
+}
- if (nr_pages > INT_MAX)
- return -ENOMEM;
+static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
+{
+ struct user_struct *user = current_user();
- if (vma_size != PAGE_SIZE * nr_pages)
- return -EINVAL;
+ atomic_long_add(user_extra, &user->locked_vm);
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
+}
- user_extra = nr_pages;
+static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ struct perf_buffer *rb;
+ int rb_flags = 0;
- mutex_lock(&event->mmap_mutex);
- ret = -EINVAL;
+ nr_pages -= 1;
/*
- * This relies on __pmu_detach_event() taking mmap_mutex after marking
- * the event REVOKED. Either we observe the state, or __pmu_detach_event()
- * will detach the rb created here.
+ * If we have rb pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
*/
- if (event->state <= PERF_EVENT_STATE_REVOKED) {
- ret = -ENODEV;
- goto unlock;
- }
-
- if (vma->vm_pgoff == 0) {
- nr_pages -= 1;
-
- /*
- * If we have rb pages ensure they're a power-of-two number, so we
- * can do bitmasks instead of modulo.
- */
- if (nr_pages != 0 && !is_power_of_2(nr_pages))
- goto unlock;
-
- WARN_ON_ONCE(event->ctx->parent_ctx);
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ return -EINVAL;
- if (event->rb) {
- if (data_page_nr(event->rb) != nr_pages)
- goto unlock;
+ WARN_ON_ONCE(event->ctx->parent_ctx);
- if (atomic_inc_not_zero(&event->rb->mmap_count)) {
- /*
- * Success -- managed to mmap() the same buffer
- * multiple times.
- */
- ret = 0;
- /* We need the rb to map pages. */
- rb = event->rb;
- goto unlock;
- }
+ if (event->rb) {
+ if (data_page_nr(event->rb) != nr_pages)
+ return -EINVAL;
+ if (refcount_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close()'s
- * atomic_dec_and_mutex_lock() remove the
- * event and continue as if !event->rb
+ * Success -- managed to mmap() the same buffer
+ * multiple times.
*/
- ring_buffer_attach(event, NULL);
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
+ return 0;
}
- } else {
/*
- * AUX area mapping: if rb->aux_nr_pages != 0, it's already
- * mapped, all subsequent mappings should have the same size
- * and offset. Must be above the normal perf buffer.
+ * Raced against perf_mmap_close()'s
+ * refcount_dec_and_mutex_lock() remove the
+ * event and continue as if !event->rb
*/
- u64 aux_offset, aux_size;
+ ring_buffer_attach(event, NULL);
+ }
- rb = event->rb;
- if (!rb)
- goto aux_unlock;
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
+ return -EPERM;
- aux_mutex = &rb->aux_mutex;
- mutex_lock(aux_mutex);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- aux_offset = READ_ONCE(rb->user_page->aux_offset);
- aux_size = READ_ONCE(rb->user_page->aux_size);
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, rb_flags);
- if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
- goto aux_unlock;
+ if (!rb)
+ return -ENOMEM;
- if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
- goto aux_unlock;
+ refcount_set(&rb->mmap_count, 1);
+ rb->mmap_user = get_current_user();
+ rb->mmap_locked = extra;
- /* already mapped with a different offset */
- if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
- goto aux_unlock;
+ ring_buffer_attach(event, rb);
- if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
- goto aux_unlock;
+ perf_event_update_time(event);
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
- /* already mapped with a different size */
- if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
- goto aux_unlock;
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_set(&event->mmap_count, 1);
- if (!is_power_of_2(nr_pages))
- goto aux_unlock;
+ return 0;
+}
- if (!atomic_inc_not_zero(&rb->mmap_count))
- goto aux_unlock;
+static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ u64 aux_offset, aux_size;
+ struct perf_buffer *rb;
+ int ret, rb_flags = 0;
- if (rb_has_aux(rb)) {
- atomic_inc(&rb->aux_mmap_count);
- ret = 0;
- goto unlock;
- }
- }
+ rb = event->rb;
+ if (!rb)
+ return -EINVAL;
- user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ guard(mutex)(&rb->aux_mutex);
/*
- * Increase the limit linearly with more CPUs:
+ * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+ * mapped, all subsequent mappings should have the same size
+ * and offset. Must be above the normal perf buffer.
*/
- user_lock_limit *= num_online_cpus();
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
- user_locked = atomic_long_read(&user->locked_vm);
+ if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+ return -EINVAL;
- /*
- * sysctl_perf_event_mlock may have changed, so that
- * user->locked_vm > user_lock_limit
- */
- if (user_locked > user_lock_limit)
- user_locked = user_lock_limit;
- user_locked += user_extra;
+ if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+ return -EINVAL;
- if (user_locked > user_lock_limit) {
- /*
- * charge locked_vm until it hits user_lock_limit;
- * charge the rest from pinned_vm
- */
- extra = user_locked - user_lock_limit;
- user_extra -= extra;
- }
+ /* already mapped with a different offset */
+ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+ return -EINVAL;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
+ if (aux_size != nr_pages * PAGE_SIZE)
+ return -EINVAL;
- if ((locked > lock_limit) && perf_is_paranoid() &&
- !capable(CAP_IPC_LOCK)) {
- ret = -EPERM;
- goto unlock;
- }
+ /* already mapped with a different size */
+ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+ return -EINVAL;
- WARN_ON(!rb && event->rb);
+ if (!is_power_of_2(nr_pages))
+ return -EINVAL;
- if (vma->vm_flags & VM_WRITE)
- flags |= RING_BUFFER_WRITABLE;
+ if (!refcount_inc_not_zero(&rb->mmap_count))
+ return -EINVAL;
- if (!rb) {
- rb = rb_alloc(nr_pages,
- event->attr.watermark ? event->attr.wakeup_watermark : 0,
- event->cpu, flags);
+ if (rb_has_aux(rb)) {
+ refcount_inc(&rb->aux_mmap_count);
- if (!rb) {
- ret = -ENOMEM;
- goto unlock;
+ } else {
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
+ refcount_dec(&rb->mmap_count);
+ return -EPERM;
}
- atomic_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
- rb->mmap_locked = extra;
+ WARN_ON(!rb && event->rb);
- ring_buffer_attach(event, rb);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- perf_event_update_time(event);
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
- ret = 0;
- } else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
- event->attr.aux_watermark, flags);
- if (!ret) {
- atomic_set(&rb->aux_mmap_count, 1);
- rb->aux_mmap_locked = extra;
+ event->attr.aux_watermark, rb_flags);
+ if (ret) {
+ refcount_dec(&rb->mmap_count);
+ return ret;
}
+
+ refcount_set(&rb->aux_mmap_count, 1);
+ rb->aux_mmap_locked = extra;
}
-unlock:
- if (!ret) {
- atomic_long_add(user_extra, &user->locked_vm);
- atomic64_add(extra, &vma->vm_mm->pinned_vm);
-
- atomic_inc(&event->mmap_count);
- } else if (rb) {
- /* AUX allocation failed */
- atomic_dec(&rb->mmap_count);
- }
-aux_unlock:
- if (aux_mutex)
- mutex_unlock(aux_mutex);
- mutex_unlock(&event->mmap_mutex);
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
+ return 0;
+}
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_event *event = file->private_data;
+ unsigned long vma_size, nr_pages;
+ mapped_f mapped;
+ int ret;
+
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same rb.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ ret = security_perf_event_read(event);
if (ret)
return ret;
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = vma_size / PAGE_SIZE;
+
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
+
+ if (vma_size != PAGE_SIZE * nr_pages)
+ return -EINVAL;
+
+ scoped_guard (mutex, &event->mmap_mutex) {
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
+ if (vma->vm_pgoff == 0)
+ ret = perf_mmap_rb(vma, event, nr_pages);
+ else
+ ret = perf_mmap_aux(vma, event, nr_pages);
+ if (ret)
+ return ret;
+ }
+
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7168,7 +7186,7 @@ aux_unlock:
* full cleanup in this case and therefore does not invoke
* vmops::close().
*/
- ret = map_range(rb, vma);
+ ret = map_range(event->rb, vma);
if (ret)
perf_mmap_close(vma);
@@ -7434,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (!(current->flags & PF_KTHREAD)) {
+ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8074,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
- if (current->mm != NULL) {
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
struct page *p;
pagefault_disable();
@@ -8186,7 +8204,8 @@ struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
- bool user = !event->attr.exclude_callchain_user;
+ bool user = !event->attr.exclude_callchain_user &&
+ !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
@@ -8198,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, 0, kernel, user,
+ callchain = get_perf_callchain(regs, kernel, user,
max_stack, crosstask, true);
return callchain ?: &__empty_callchain;
}
@@ -10324,6 +10343,7 @@ static int __perf_event_overflow(struct perf_event *event,
ret = 1;
event->pending_kill = POLL_HUP;
perf_event_disable_inatomic(event);
+ event->pmu->stop(event, 0);
}
if (event->attr.sigtrap) {
@@ -11225,6 +11245,10 @@ static int __perf_event_set_bpf_prog(struct perf_event *event,
if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
+ /* Writing to context allowed only for uprobes. */
+ if (prog->aux->kprobe_write_ctx && !is_uprobe)
+ return -EINVAL;
+
if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
@@ -12210,7 +12234,7 @@ static const struct attribute_group *pmu_dev_groups[] = {
};
static int pmu_bus_running;
-static struct bus_type pmu_bus = {
+static const struct bus_type pmu_bus = {
.name = "event_source",
.dev_groups = pmu_dev_groups,
};
@@ -13242,7 +13266,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
/* Can't redirect output if we've got an active mmap() */
- if (atomic_read(&event->mmap_count))
+ if (refcount_read(&event->mmap_count))
goto unlock;
if (output_event) {
@@ -13255,7 +13279,7 @@ set:
goto unlock;
/* did we race against perf_mmap_close() */
- if (!atomic_read(&rb->mmap_count)) {
+ if (!refcount_read(&rb->mmap_count)) {
ring_buffer_put(rb);
goto unlock;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 249288d82b8d..d9cc57083091 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -35,7 +35,7 @@ struct perf_buffer {
spinlock_t event_lock;
struct list_head event_list;
- atomic_t mmap_count;
+ refcount_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
@@ -47,7 +47,7 @@ struct perf_buffer {
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
- atomic_t aux_mmap_count;
+ refcount_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index aa9a759e824f..20a905023736 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* the same order, see perf_mmap_close. Otherwise we end up freeing
* aux pages in this path, which is a bug, because in_atomic().
*/
- if (!atomic_read(&rb->aux_mmap_count))
+ if (!refcount_read(&rb->aux_mmap_count))
goto err;
if (!refcount_inc_not_zero(&rb->aux_refcount))
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7ca1940607bd..5dcf927310fd 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -121,7 +121,7 @@ struct xol_area {
static void uprobe_warn(struct task_struct *t, const char *msg)
{
- pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
+ pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg);
}
/*
@@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}
-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}
-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
+ int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
- copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
- if (is_swbp_insn(new_opcode)) {
+ if (is_swbp_insn(insn)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
@@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
return identical;
}
-static int __uprobe_write_opcode(struct vm_area_struct *vma,
+static int __uprobe_write(struct vm_area_struct *vma,
struct folio_walk *fw, struct folio *folio,
- unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+ unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ bool is_register)
{
- const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
- const bool is_register = !!is_swbp_insn(&opcode);
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
bool pmd_mappable;
/* For now, we'll only handle PTE-mapped folios. */
@@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
*/
flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
- copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_to_page(fw->page, insn_vaddr, insn, nbytes);
/*
* When unregistering, we may only zap a PTE if uffd is disabled and
@@ -482,23 +483,32 @@ remap:
* @opcode_vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @opcode_vaddr.
*
- * Called with mm->mmap_lock held for read or write.
+ * Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
- const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
+ bool is_register)
{
- const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
+ verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
+}
+
+int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+ void *data)
+{
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
- int ret, is_register, ref_ctr_updated = 0;
+ int ret, ref_ctr_updated = 0;
unsigned int gup_flags = FOLL_FORCE;
struct mmu_notifier_range range;
struct folio_walk fw;
struct folio *folio;
struct page *page;
- is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
@@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* page that we can safely modify. Use FOLL_WRITE to trigger a write
* fault if required. When unregistering, we might be lucky and the
* anon page is already gone. So defer write faults until really
- * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write()
* cannot deal with PMDs yet.
*/
if (is_register)
@@ -521,14 +531,14 @@ retry:
goto out;
folio = page_folio(page);
- ret = verify_opcode(page, opcode_vaddr, &opcode);
+ ret = verify(page, insn_vaddr, insn, nbytes, data);
if (ret <= 0) {
folio_put(folio);
goto out;
}
/* We are going to replace instruction, update ref_ctr. */
- if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
+ if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
if (ret) {
folio_put(folio);
@@ -560,7 +570,7 @@ retry:
/* Walk the page tables again, to perform the actual update. */
if (folio_walk_start(&fw, vma, vaddr, 0)) {
if (fw.page == page)
- ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
+ ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
folio_walk_end(&fw, vma);
}
@@ -580,7 +590,7 @@ retry:
out:
/* Revert back reference counter if instruction update failed. */
- if (ret < 0 && ref_ctr_updated)
+ if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
@@ -602,7 +612,7 @@ out:
int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
}
/**
@@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe,
struct vm_area_struct *vma, unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
- *(uprobe_opcode_t *)&auprobe->insn);
+ *(uprobe_opcode_t *)&auprobe->insn, false);
}
/* uprobe should have guaranteed positive refcount */
@@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
if (IS_ERR(page))
return PTR_ERR(page);
- copy_from_page(page, offset, insn, nbytes);
+ uprobe_copy_from_page(page, offset, insn, nbytes);
put_page(page);
return 0;
@@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
- GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ GFP_NOWAIT | __GFP_NOMEMALLOC);
if (prev)
prev->next = NULL;
}
@@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode,
return ERR_PTR(-EINVAL);
/*
- * This ensures that copy_from_page(), copy_to_page() and
+ * This ensures that uprobe_copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
@@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
struct vm_area_struct *vma;
int err = 0;
- mmap_read_lock(mm);
+ mmap_write_lock(mm);
for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, vma, vaddr);
}
- mmap_read_unlock(mm);
+ mmap_write_unlock(mm);
return err;
}
@@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}
-void * __weak arch_uprobe_trampoline(unsigned long *psize)
+void * __weak arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
@@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- insns = arch_uprobe_trampoline(&insns_size);
+ insns = arch_uretprobe_trampoline(&insns_size);
arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
@@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void)
return area;
}
+void __weak arch_uprobe_clear_state(struct mm_struct *mm)
+{
+}
+
+void __weak arch_uprobe_init_state(struct mm_struct *mm)
+{
+}
+
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm)
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);
+ arch_uprobe_clear_state(mm);
+
if (!area)
return;
@@ -2160,7 +2180,7 @@ static void dup_xol_work(struct callback_head *work)
/*
* Called in context of a new clone/fork from copy_process.
*/
-void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+void uprobe_copy_process(struct task_struct *t, u64 flags)
{
struct uprobe_task *utask = current->utask;
struct mm_struct *mm = current->mm;
@@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
- copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
@@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2741,6 +2765,16 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /*
+ * If user decided to take execution elsewhere, it makes little sense
+ * to execute the original instruction, so let's skip it.
+ */
+ if (instruction_pointer(regs) != bp_vaddr)
+ goto out;
+
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
@@ -2752,6 +2786,23 @@ out:
rcu_read_unlock_trace();
}
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ guard(rcu_tasks_trace)();
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+ if (!uprobe)
+ return;
+ if (!get_utask())
+ return;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ return;
+ handler_chain(uprobe, regs);
+}
+
/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.
diff --git a/kernel/fork.c b/kernel/fork.c
index af673856499d..cffa6157a55a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -689,7 +689,6 @@ void __mmdrop(struct mm_struct *mm)
mm_pasid_drop(mm);
mm_destroy_cid(mm);
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
- futex_hash_free(mm);
free_mm(mm);
}
@@ -1015,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
+ arch_uprobe_init_state(mm);
#endif
}
@@ -1138,6 +1138,7 @@ static inline void __mmput(struct mm_struct *mm)
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
+ futex_hash_free(mm);
mmdrop(mm);
}
@@ -1507,7 +1508,7 @@ fail_nomem:
return NULL;
}
-static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_mm(u64 clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
@@ -1545,7 +1546,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
@@ -1566,7 +1567,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+static int copy_files(u64 clone_flags, struct task_struct *tsk,
int no_files)
{
struct files_struct *oldf, *newf;
@@ -1596,7 +1597,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
return 0;
}
-static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -1645,7 +1646,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
posix_cputimers_group_init(pct, cpu_limit);
}
-static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_signal(u64 clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
@@ -1688,6 +1689,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->cgroup_threadgroup_rwsem);
+#endif
+
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -2295,7 +2300,7 @@ __latent_entropy struct task_struct *copy_process(
if (need_futex_hash_allocate_default(clone_flags)) {
retval = futex_hash_allocate_default();
if (retval)
- goto bad_fork_core_free;
+ goto bad_fork_cancel_cgroup;
/*
* If we fail beyond this point we don't free the allocated
* futex hash map. We assume that another thread will be created
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index d9bb5567af0c..125804fbb5cb 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1722,12 +1722,9 @@ int futex_mm_init(struct mm_struct *mm)
RCU_INIT_POINTER(mm->futex_phash, NULL);
mm->futex_phash_new = NULL;
/* futex-ref */
+ mm->futex_ref = NULL;
atomic_long_set(&mm->futex_atomic, 0);
mm->futex_batches = get_state_synchronize_rcu();
- mm->futex_ref = alloc_percpu(unsigned int);
- if (!mm->futex_ref)
- return -ENOMEM;
- this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
return 0;
}
@@ -1801,6 +1798,17 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
}
}
+ if (!mm->futex_ref) {
+ /*
+ * This will always be allocated by the first thread and
+ * therefore requires no locking.
+ */
+ mm->futex_ref = alloc_percpu(unsigned int);
+ if (!mm->futex_ref)
+ return -ENOMEM;
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ }
+
fph = kvzalloc(struct_size(fph, queues, hash_slots),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!fph)
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index c74eac572acd..2cd57096c38e 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -319,13 +319,13 @@ static __always_inline int futex_put_value(u32 val, u32 __user *to)
{
if (can_do_masked_user_access())
to = masked_user_access_begin(to);
- else if (!user_read_access_begin(to, sizeof(*to)))
+ else if (!user_write_access_begin(to, sizeof(*to)))
return -EFAULT;
unsafe_put_user(val, to, Efault);
- user_read_access_end();
+ user_write_access_end();
return 0;
Efault:
- user_read_access_end();
+ user_write_access_end();
return -EFAULT;
}
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index c716a66f8692..d818b4d47f1b 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -230,8 +230,9 @@ static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
struct futex_hash_bucket *hb)
{
- q->key = *key;
+ struct task_struct *task;
+ q->key = *key;
__futex_unqueue(q);
WARN_ON(!q->rt_waiter);
@@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
futex_hash_get(hb);
q->drop_hb_ref = true;
q->lock_ptr = &hb->lock;
+ task = READ_ONCE(q->task);
/* Signal locked state to the waiter */
futex_requeue_pi_complete(q, 1);
- wake_up_state(q->task, TASK_NORMAL);
+ wake_up_state(task, TASK_NORMAL);
}
/**
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 4b6da9116aa6..880c9bf2f315 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -39,6 +39,56 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
return 0;
}
+static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
+{
+#ifdef CONFIG_COMPAT
+ if (compat)
+ return p->compat_robust_list;
+#endif
+ return p->robust_list;
+}
+
+static void __user *futex_get_robust_list_common(int pid, bool compat)
+{
+ struct task_struct *p = current;
+ void __user *head;
+ int ret;
+
+ scoped_guard(rcu) {
+ if (pid) {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ return (void __user *)ERR_PTR(-ESRCH);
+ }
+ get_task_struct(p);
+ }
+
+ /*
+ * Hold exec_update_lock to serialize with concurrent exec()
+ * so ptrace_may_access() is checked against stable credentials
+ */
+ ret = down_read_killable(&p->signal->exec_update_lock);
+ if (ret)
+ goto err_put;
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = futex_task_robust_list(p, compat);
+
+ up_read(&p->signal->exec_update_lock);
+ put_task_struct(p);
+
+ return head;
+
+err_unlock:
+ up_read(&p->signal->exec_update_lock);
+err_put:
+ put_task_struct(p);
+ return (void __user *)ERR_PTR(ret);
+}
+
/**
* sys_get_robust_list() - Get the robust-futex list head of a task
* @pid: pid of the process [zero for current task]
@@ -49,36 +99,14 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
struct robust_list_head __user * __user *, head_ptr,
size_t __user *, len_ptr)
{
- struct robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
+ struct robust_list_head __user *head = futex_get_robust_list_common(pid, false);
- head = p->robust_list;
- rcu_read_unlock();
+ if (IS_ERR(head))
+ return PTR_ERR(head);
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -455,36 +483,14 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
compat_uptr_t __user *, head_ptr,
compat_size_t __user *, len_ptr)
{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
+ struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true);
- head = p->compat_robust_list;
- rcu_read_unlock();
+ if (IS_ERR(head))
+ return PTR_ERR(head);
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
}
#endif /* CONFIG_COMPAT */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 1da5e9d9da71..1b4254d19a73 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -6,10 +6,6 @@ menu "IRQ subsystem"
config MAY_HAVE_SPARSE_IRQ
bool
-# Legacy support, required for itanic
-config GENERIC_IRQ_LEGACY
- bool
-
# Enable the generic irq autoprobe mechanism
config GENERIC_IRQ_PROBE
bool
@@ -147,7 +143,9 @@ config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
config IRQ_KUNIT_TEST
bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
depends on KUNIT=y
+ depends on SPARSE_IRQ
default KUNIT_ALL_TESTS
+ select IRQ_DOMAIN
imply SMP
help
This option enables KUnit tests for the IRQ subsystem API. These are
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 0d0276378c70..3ffa0d80ddd1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1260,6 +1260,43 @@ int irq_chip_get_parent_state(struct irq_data *data,
EXPORT_SYMBOL_GPL(irq_chip_get_parent_state);
/**
+ * irq_chip_shutdown_parent - Shutdown the parent interrupt
+ * @data: Pointer to interrupt specific data
+ *
+ * Invokes the irq_shutdown() callback of the parent if available or falls
+ * back to irq_chip_disable_parent().
+ */
+void irq_chip_shutdown_parent(struct irq_data *data)
+{
+ struct irq_data *parent = data->parent_data;
+
+ if (parent->chip->irq_shutdown)
+ parent->chip->irq_shutdown(parent);
+ else
+ irq_chip_disable_parent(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent);
+
+/**
+ * irq_chip_startup_parent - Startup the parent interrupt
+ * @data: Pointer to interrupt specific data
+ *
+ * Invokes the irq_startup() callback of the parent if available or falls
+ * back to irq_chip_enable_parent().
+ */
+unsigned int irq_chip_startup_parent(struct irq_data *data)
+{
+ struct irq_data *parent = data->parent_data;
+
+ if (parent->chip->irq_startup)
+ return parent->chip->irq_startup(parent);
+
+ irq_chip_enable_parent(data);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_chip_startup_parent);
+
+/**
* irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
* NULL)
* @data: Pointer to interrupt specific data
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index eb16a58e0322..b41188698622 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -30,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
return this->irq == match->irq && this->dev_id == match->dev_id;
}
-/**
- * devm_request_threaded_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @thread_fn: function to be called in a threaded interrupt context. NULL
- * for devices which handle everything in @handler
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
- * @dev_id: A cookie passed back to the handler function
- *
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_threaded_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
- *
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
- */
-int devm_request_threaded_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, irq_handler_t thread_fn,
- unsigned long irqflags, const char *devname,
- void *dev_id)
+static int devm_request_result(struct device *dev, int rc, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ const char *devname)
+{
+ if (rc >= 0)
+ return rc;
+
+ return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n",
+ irq, handler, thread_fn, devname ? : "");
+}
+
+static int __devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ irq_handler_t thread_fn,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -78,28 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
return 0;
}
-EXPORT_SYMBOL(devm_request_threaded_irq);
/**
- * devm_request_any_context_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
- * @dev_id: A cookie passed back to the handler function
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @thread_fn: Function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
*
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_any_context_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_threaded_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
*
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
+ * Return: 0 on success or a negative error number.
*/
-int devm_request_any_context_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, unsigned long irqflags,
- const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ unsigned long irqflags, const char *devname,
+ void *dev_id)
+{
+ int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn,
+ irqflags, devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, thread_fn, devname);
+}
+EXPORT_SYMBOL(devm_request_threaded_irq);
+
+static int __devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -124,6 +137,40 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
return rc;
}
+
+/**
+ * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_any_context_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
+ *
+ * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error
+ * number.
+ */
+int devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags,
+ devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, NULL, devname);
+}
EXPORT_SYMBOL(devm_request_any_context_irq);
/**
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9489f93b3db3..e103451243a0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -136,6 +136,44 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
+static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled);
+static u64 irqhandler_duration_threshold_ns __ro_after_init;
+
+static int __init irqhandler_duration_check_setup(char *arg)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(arg, 0, &val);
+ if (ret) {
+ pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret);
+ return 0;
+ }
+
+ if (!val) {
+ pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n");
+ return 0;
+ }
+
+ irqhandler_duration_threshold_ns = val * 1000;
+ static_branch_enable(&irqhandler_duration_check_enabled);
+
+ return 1;
+}
+__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup);
+
+static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq,
+ const struct irqaction *action)
+{
+ u64 delta_ns = local_clock() - ts_start;
+
+ if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) {
+ pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n",
+ smp_processor_id(), irq, action->handler,
+ div_u64(delta_ns, NSEC_PER_USEC));
+ }
+}
+
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
@@ -155,7 +193,16 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
lockdep_hardirq_threaded();
trace_irq_handler_entry(irq, action);
- res = action->handler(irq, action->dev_id);
+
+ if (static_branch_unlikely(&irqhandler_duration_check_enabled)) {
+ u64 ts_start = local_clock();
+
+ res = action->handler(irq, action->dev_id);
+ irqhandler_duration_check(ts_start, irq, action);
+ } else {
+ res = action->handler(irq, action->dev_id);
+ }
+
trace_irq_handler_exit(irq, action, res);
if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c
index a75abebed7f2..e2d31914b3c4 100644
--- a/kernel/irq/irq_test.c
+++ b/kernel/irq/irq_test.c
@@ -41,21 +41,37 @@ static struct irq_chip fake_irq_chip = {
.flags = IRQCHIP_SKIP_SET_WAKE,
};
-static void irq_disable_depth_test(struct kunit *test)
+static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd)
{
struct irq_desc *desc;
- int virq, ret;
+ int virq;
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd);
KUNIT_ASSERT_GE(test, virq, 0);
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */
+ irq_settings_clr_norequest(desc);
+
+ return virq;
+}
+
+static void irq_disable_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_test_setup_fake_irq(test, NULL);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
@@ -73,16 +89,13 @@ static void irq_free_disabled_test(struct kunit *test)
struct irq_desc *desc;
int virq, ret;
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, NULL);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
@@ -93,7 +106,7 @@ static void irq_free_disabled_test(struct kunit *test)
KUNIT_EXPECT_GE(test, desc->depth, 1);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
free_irq(virq, NULL);
@@ -112,10 +125,7 @@ static void irq_shutdown_depth_test(struct kunit *test)
if (!IS_ENABLED(CONFIG_SMP))
kunit_skip(test, "requires CONFIG_SMP for managed shutdown");
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, &affinity);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
@@ -124,7 +134,7 @@ static void irq_shutdown_depth_test(struct kunit *test)
KUNIT_ASSERT_PTR_NE(test, data, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
@@ -169,13 +179,12 @@ static void irq_cpuhotplug_test(struct kunit *test)
kunit_skip(test, "requires more than 1 CPU for CPU hotplug");
if (!cpu_is_hotpluggable(1))
kunit_skip(test, "CPU 1 must be hotpluggable");
+ if (!cpu_online(1))
+ kunit_skip(test, "CPU 1 must be online");
cpumask_copy(&affinity.mask, cpumask_of(1));
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, &affinity);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
@@ -184,7 +193,7 @@ static void irq_cpuhotplug_test(struct kunit *test)
KUNIT_ASSERT_PTR_NE(test, data, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
@@ -196,13 +205,9 @@ static void irq_cpuhotplug_test(struct kunit *test)
KUNIT_EXPECT_EQ(test, desc->depth, 1);
KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
- KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
- KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
KUNIT_EXPECT_GE(test, desc->depth, 1);
KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
- KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
- KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
KUNIT_EXPECT_EQ(test, desc->depth, 1);
enable_irq(virq);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index b64c57b44c20..db714d3014b5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -653,13 +653,6 @@ void irq_mark_irq(unsigned int irq)
irq_insert_desc(irq, irq_desc + irq);
}
-#ifdef CONFIG_GENERIC_IRQ_LEGACY
-void irq_init_desc(unsigned int irq)
-{
- free_desc(irq);
-}
-#endif
-
#endif /* !CONFIG_SPARSE_IRQ */
int handle_irq_desc(struct irq_desc *desc)
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 9b09ad3f9914..e7ad99254841 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1644,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
else
__msi_domain_free_irqs(dev, domain, ctrl);
- if (ops->msi_post_free)
- ops->msi_post_free(domain, dev);
-
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
msi_domain_free_descs(dev, ctrl);
}
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index e49743ae52c5..ecd1ac210dbd 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -144,14 +144,34 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
unsigned int order)
{
struct kho_mem_phys_bits *bits;
- struct kho_mem_phys *physxa;
+ struct kho_mem_phys *physxa, *new_physxa;
const unsigned long pfn_high = pfn >> order;
might_sleep();
- physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa));
- if (IS_ERR(physxa))
- return PTR_ERR(physxa);
+ physxa = xa_load(&track->orders, order);
+ if (!physxa) {
+ int err;
+
+ new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
+ if (!new_physxa)
+ return -ENOMEM;
+
+ xa_init(&new_physxa->phys_bits);
+ physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
+ GFP_KERNEL);
+
+ err = xa_err(physxa);
+ if (err || physxa) {
+ xa_destroy(&new_physxa->phys_bits);
+ kfree(new_physxa);
+
+ if (err)
+ return err;
+ } else {
+ physxa = new_physxa;
+ }
+ }
bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
sizeof(*bits));
@@ -544,6 +564,7 @@ err_free_scratch_areas:
err_free_scratch_desc:
memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
+ pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
kho_enable = false;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0e98b228a8ef..31b072e8d427 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -893,6 +893,7 @@ out:
return ret;
}
+EXPORT_SYMBOL_GPL(kthread_affine_preferred);
/*
* Re-affine kthreads according to their preferences
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 086fd5487ca7..31a785afee6c 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -342,8 +342,12 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
* When waking up the task to wound, be sure to clear the
* blocked_on pointer. Otherwise we can see circular
* blocked_on relationships that can't resolve.
+ *
+ * NOTE: We pass NULL here instead of lock, because we
+ * are waking the mutex owner, who may be currently
+ * blocked on a different mutex.
*/
- __clear_task_blocked_on(owner, lock);
+ __clear_task_blocked_on(owner, NULL);
wake_q_add(wake_q, owner);
}
return true;
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 39278737bb68..2a1beebf1d37 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -460,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST
config MODULES_TREE_LOOKUP
def_bool y
- depends on PERF_EVENTS || TRACING || CFI_CLANG
+ depends on PERF_EVENTS || TRACING || CFI
endif # MODULES
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
index d3204c5c74eb..f8e8c126705c 100644
--- a/kernel/module/tree_lookup.c
+++ b/kernel/module/tree_lookup.c
@@ -14,7 +14,7 @@
* Use a latched RB-tree for __module_address(); this allows us to use
* RCU lookups of the address from any context.
*
- * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can
+ * This is conditional on PERF_EVENTS || TRACING || CFI because those can
* really hit __module_address() hard by doing a lot of stack unwinding;
* potentially from NMI context.
*/
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
new file mode 100644
index 000000000000..c1fb2bad6d72
--- /dev/null
+++ b/kernel/nscommon.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ns_common.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+#ifdef CONFIG_DEBUG_VFS
+static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ VFS_WARN_ON_ONCE(ops != &ipcns_operations);
+ break;
+#endif
+ case CLONE_NEWNS:
+ VFS_WARN_ON_ONCE(ops != &mntns_operations);
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ VFS_WARN_ON_ONCE(ops != &netns_operations);
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ VFS_WARN_ON_ONCE(ops != &pidns_operations);
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ VFS_WARN_ON_ONCE(ops != &timens_operations);
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ VFS_WARN_ON_ONCE(ops != &userns_operations);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ VFS_WARN_ON_ONCE(ops != &utsns_operations);
+ break;
+#endif
+ }
+}
+#endif
+
+int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
+{
+ refcount_set(&ns->__ns_ref, 1);
+ ns->stashed = NULL;
+ ns->ops = ops;
+ ns->ns_id = 0;
+ ns->ns_type = ns_type;
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ INIT_LIST_HEAD(&ns->ns_list_node);
+
+#ifdef CONFIG_DEBUG_VFS
+ ns_debug(ns, ops);
+#endif
+
+ if (inum) {
+ ns->inum = inum;
+ return 0;
+ }
+ return proc_alloc_inum(&ns->inum);
+}
+
+void __ns_common_free(struct ns_common *ns)
+{
+ proc_free_inum(ns->inum);
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 5f31fdff8a38..19aa64ab08c8 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void)
* Return the newly created nsproxy. Do not attach this to the task,
* leave it to the caller to do proper locking and attach it to task.
*/
-static struct nsproxy *create_new_namespaces(unsigned long flags,
+static struct nsproxy *create_new_namespaces(u64 flags,
struct task_struct *tsk, struct user_namespace *user_ns,
struct fs_struct *new_fs)
{
@@ -144,7 +144,7 @@ out_ns:
* called from clone. This now handles copy for nsproxy and all
* namespaces therein.
*/
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(u64 flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
@@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (proc_ns_file(fd_file(f))) {
ns = get_proc_ns(file_inode(fd_file(f)));
- if (flags && (ns->ops->type != flags))
+ if (flags && (ns->ns_type != flags))
err = -EINVAL;
- flags = ns->ops->type;
+ flags = ns->ns_type;
} else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
err = check_setns_flags(flags);
} else {
diff --git a/kernel/nstree.c b/kernel/nstree.c
new file mode 100644
index 000000000000..b24a320a11a6
--- /dev/null
+++ b/kernel/nstree.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+/**
+ * struct ns_tree - Namespace tree
+ * @ns_tree: Rbtree of namespaces of a particular type
+ * @ns_list: Sequentially walkable list of all namespaces of this type
+ * @ns_tree_lock: Seqlock to protect the tree and list
+ * @type: type of namespaces in this tree
+ */
+struct ns_tree {
+ struct rb_root ns_tree;
+ struct list_head ns_list;
+ seqlock_t ns_tree_lock;
+ int type;
+};
+
+struct ns_tree mnt_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNS,
+};
+
+struct ns_tree net_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNET,
+};
+EXPORT_SYMBOL_GPL(net_ns_tree);
+
+struct ns_tree uts_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUTS,
+};
+
+struct ns_tree user_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUSER,
+};
+
+struct ns_tree ipc_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWIPC,
+};
+
+struct ns_tree pid_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWPID,
+};
+
+struct ns_tree cgroup_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWCGROUP,
+};
+
+struct ns_tree time_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWTIME,
+};
+
+DEFINE_COOKIE(namespace_cookie);
+
+static inline struct ns_common *node_to_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_tree_node);
+}
+
+static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct ns_common *ns_a = node_to_ns(a);
+ struct ns_common *ns_b = node_to_ns(b);
+ u64 ns_id_a = ns_a->ns_id;
+ u64 ns_id_b = ns_b->ns_id;
+
+ if (ns_id_a < ns_id_b)
+ return -1;
+ if (ns_id_a > ns_id_b)
+ return 1;
+ return 0;
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ struct rb_node *node, *prev;
+
+ VFS_WARN_ON_ONCE(!ns->ns_id);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+ node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
+ /*
+ * If there's no previous entry simply add it after the
+ * head and if there is add it after the previous entry.
+ */
+ prev = rb_prev(&ns->ns_tree_node);
+ if (!prev)
+ list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
+ else
+ list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+
+ write_sequnlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(node);
+}
+
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
+ VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+ rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
+ list_bidir_del_rcu(&ns->ns_list_node);
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ write_sequnlock(&ns_tree->ns_tree_lock);
+}
+EXPORT_SYMBOL_GPL(__ns_tree_remove);
+
+static int ns_find(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+
+static struct ns_tree *ns_tree_from_type(int ns_type)
+{
+ switch (ns_type) {
+ case CLONE_NEWCGROUP:
+ return &cgroup_ns_tree;
+ case CLONE_NEWIPC:
+ return &ipc_ns_tree;
+ case CLONE_NEWNS:
+ return &mnt_ns_tree;
+ case CLONE_NEWNET:
+ return &net_ns_tree;
+ case CLONE_NEWPID:
+ return &pid_ns_tree;
+ case CLONE_NEWUSER:
+ return &user_ns_tree;
+ case CLONE_NEWUTS:
+ return &uts_ns_tree;
+ case CLONE_NEWTIME:
+ return &time_ns_tree;
+ }
+
+ return NULL;
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+
+ do {
+ seq = read_seqbegin(&ns_tree->ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+
+ if (!node)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+
+ return node_to_ns(node);
+}
+
+/**
+ * ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * tree
+ * @ns: namespace to start from
+ * @previous: if true find the previous namespace, otherwise the next
+ *
+ * Find the next or previous namespace in the same tree as @ns. If
+ * there is no next/previous namespace, -ENOENT is returned.
+ */
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree *ns_tree, bool previous)
+{
+ struct list_head *list;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
+
+ if (previous)
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+ else
+ list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
+ if (list_is_head(list, &ns_tree->ns_list))
+ return ERR_PTR(-ENOENT);
+
+ VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
+
+ return list_entry_rcu(list, struct ns_common, ns_list_node);
+}
+
+/**
+ * ns_tree_gen_id - generate a new namespace id
+ * @ns: namespace to generate id for
+ *
+ * Generates a new namespace id and assigns it to the namespace. All
+ * namespaces types share the same id space and thus can be compared
+ * directly. IOW, when two ids of two namespace are equal, they are
+ * identical.
+ */
+u64 ns_tree_gen_id(struct ns_common *ns)
+{
+ guard(preempt)();
+ ns->ns_id = gen_cookie_next(&namespace_cookie);
+ return ns->ns_id;
+}
diff --git a/kernel/params.c b/kernel/params.c
index b92d64161b75..b96cfd693c99 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -513,13 +513,14 @@ EXPORT_SYMBOL(param_array_ops);
int param_set_copystring(const char *val, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
+ const size_t len = strnlen(val, kps->maxlen);
- if (strnlen(val, kps->maxlen) == kps->maxlen) {
+ if (len == kps->maxlen) {
pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
- strcpy(kps->string, val);
+ memcpy(kps->string, val, len + 1);
return 0;
}
EXPORT_SYMBOL(param_set_copystring);
@@ -841,7 +842,7 @@ static void __init param_sysfs_builtin(void)
dot = strchr(kp->name, '.');
if (!dot) {
/* This happens for core_param() */
- strcpy(modname, "kernel");
+ strscpy(modname, "kernel");
name_len = 0;
} else {
name_len = dot - kp->name + 1;
diff --git a/kernel/pid.c b/kernel/pid.c
index c45a28c16cd2..4fffec767a63 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,13 +71,13 @@ static int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = PROC_PID_INIT_INO,
+ .ns.inum = ns_init_inum(&init_pid_ns),
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
@@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = {
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
+ .ns.ns_type = ns_common_type(&init_pid_ns),
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -491,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
struct upid *upid;
pid_t nr = 0;
- if (pid && ns->level <= pid->level) {
+ if (pid && ns && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
nr = upid->nr;
@@ -514,7 +515,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
+ if (ns)
+ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
@@ -680,7 +682,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head,
container_of(head->set, struct pid_namespace, set);
int mode = table->mode;
- if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
+ if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
mode = (mode & S_IRWXU) >> 6;
else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7098ed44e717..650be58d8d18 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <linux/nstree.h>
#include <uapi/linux/wait.h>
#include "pid_sysctl.h"
@@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_idr;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto out_free_idr;
- ns->ns.ops = &pidns_operations;
ns->pid_max = PID_MAX_LIMIT;
err = register_pidns_sysctls(ns);
if (err)
goto out_free_inum;
- refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
@@ -124,10 +123,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
+ ns_tree_add(ns);
return ns;
out_free_inum:
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -149,9 +149,10 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ ns_tree_remove(ns);
unregister_pidns_sysctls(ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
@@ -168,10 +169,10 @@ static void destroy_pid_namespace_work(struct work_struct *work)
parent = ns->parent;
destroy_pid_namespace(ns);
ns = parent;
- } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
+ } while (ns != &init_pid_ns && ns_ref_put(ns));
}
-struct pid_namespace *copy_pid_ns(unsigned long flags,
+struct pid_namespace *copy_pid_ns(u64 flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
@@ -183,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
void put_pid_ns(struct pid_namespace *ns)
{
- if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
+ if (ns && ns != &init_pid_ns && ns_ref_put(ns))
schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
-{
- return container_of(ns, struct pid_namespace, ns);
-}
-
static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -390,11 +386,23 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
+bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor)
+{
+ struct pid_namespace *ns;
+
+ if (child->level < ancestor->level)
+ return false;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return ns == ancestor;
+}
+
static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = to_pid_ns(ns);
+ struct pid_namespace *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
@@ -408,13 +416,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns)
* this maintains the property that processes and their
* children can not escape their current pid namespace.
*/
- if (new->level < active->level)
- return -EINVAL;
-
- ancestor = new;
- while (ancestor->level > active->level)
- ancestor = ancestor->parent;
- if (ancestor != active)
+ if (!pidns_is_ancestor(new, active))
return -EINVAL;
put_pid_ns(nsproxy->pid_ns_for_children);
@@ -447,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns)
const struct proc_ns_operations pidns_operations = {
.name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
@@ -458,7 +459,6 @@ const struct proc_ns_operations pidns_operations = {
const struct proc_ns_operations pidns_for_children_operations = {
.name = "pid_for_children",
.real_ns_name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_for_children_get,
.put = pidns_put,
.install = pidns_install,
@@ -475,6 +475,7 @@ static __init int pid_namespaces_init(void)
#endif
register_pid_ns_sysctl_table_vm();
+ ns_tree_add(&init_pid_ns);
return 0;
}
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index ea7995a25780..8df55397414a 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -553,6 +553,30 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
const struct em_data_callback *cb,
const cpumask_t *cpus, bool microwatts)
{
+ int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);
+
+ if (_is_cpu_device(dev))
+ em_check_capacity_update();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_dev_register_pd_no_update() - Register a perf domain for a device
+ * @dev : Device to register the PD for
+ * @nr_states : Number of performance states in the new PD
+ * @cb : Callback functions for populating the energy model
+ * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device)
+ * @microwatts : Whether or not the power values in the EM will be in uW
+ *
+ * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
+ * update after registering the PD, even if @dev is a CPU device.
+ */
+int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
+{
struct em_perf_table *em_table;
unsigned long cap, prev_cap = 0;
unsigned long flags = 0;
@@ -636,12 +660,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
unlock:
mutex_unlock(&em_pd_mutex);
- if (_is_cpu_device(dev))
- em_check_capacity_update();
-
return ret;
}
-EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
/**
* em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f1f30cca573..2f66ab453823 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -449,6 +449,7 @@ int hibernation_snapshot(int platform_mode)
shrink_shmem_memory();
console_suspend_all();
+ pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 174ee243b349..8eff357b0436 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4262,6 +4262,8 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcu_preempt_deferred_qs_init(rdp);
rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de6ca13a7b5f..b8bbe7960cda 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -488,6 +488,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fc14adf15cbb..4cd170b2d655 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -763,8 +763,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- rdp->defer_qs_iw =
- IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
@@ -904,6 +902,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
}
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp)
+{
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+}
#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
@@ -1103,6 +1105,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e81..2452b7366b00 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
/*
* Load and clear event mask atomically with respect to
- * scheduler preemption.
+ * scheduler preemption and membarrier IPIs.
*/
- preempt_disable();
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- preempt_enable();
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ }
return !!event_mask;
}
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index c4a488e67aa7..755883faf751 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -58,6 +58,7 @@
#include "deadline.c"
#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext_internal.h"
# include "ext.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4..198d2dd45f59 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,6 +7,8 @@
* Copyright (C) 1991-2002 Linus Torvalds
* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
+#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE
+#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
@@ -917,7 +919,7 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
+ rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta);
if (rq == this_rq())
__hrtick_restart(rq);
@@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
__do_set_cpus_allowed(p, &ac);
}
-void migrate_disable(void)
-{
- struct task_struct *p = current;
-
- if (p->migration_disabled) {
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- *Warn about overflow half-way through the range.
- */
- WARN_ON_ONCE((s16)p->migration_disabled < 0);
-#endif
- p->migration_disabled++;
- return;
- }
-
- guard(preempt)();
- this_rq()->nr_pinned++;
- p->migration_disabled = 1;
-}
-EXPORT_SYMBOL_GPL(migrate_disable);
-
-void migrate_enable(void)
+void ___migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
@@ -2410,35 +2391,19 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Check both overflow from migrate_disable() and superfluous
- * migrate_enable().
- */
- if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
- return;
-#endif
+ __set_cpus_allowed_ptr(p, &ac);
+}
+EXPORT_SYMBOL_GPL(___migrate_enable);
- if (p->migration_disabled > 1) {
- p->migration_disabled--;
- return;
- }
+void migrate_disable(void)
+{
+ __migrate_disable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
- /*
- * Ensure stop_task runs either before or after this, and that
- * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
- */
- guard(preempt)();
- if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &ac);
- /*
- * Mustn't clear migration_disabled() until cpus_ptr points back at the
- * regular cpus_mask, otherwise things that race (eg.
- * select_fallback_rq) get confused.
- */
- barrier();
- p->migration_disabled = 0;
- this_rq()->nr_pinned--;
+void migrate_enable(void)
+{
+ __migrate_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -4472,7 +4437,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
* __sched_fork() is basic setup which is also used by sched_init() to
* initialize the boot CPU's idle task.
*/
-static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+static void __sched_fork(u64 clone_flags, struct task_struct *p)
{
p->on_rq = 0;
@@ -4490,6 +4455,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
+#ifdef CONFIG_CFS_BANDWIDTH
+ init_cfs_throttle_work(p);
+#endif
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -4707,7 +4675,7 @@ late_initcall(sched_core_sysctl_init);
/*
* fork()/clone()-time setup:
*/
-int sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(u64 clone_flags, struct task_struct *p)
{
__sched_fork(clone_flags, p);
/*
@@ -9362,8 +9330,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task, false);
-
- scx_cgroup_finish_attach();
}
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
@@ -9551,7 +9517,7 @@ static unsigned long tg_weight(struct task_group *tg)
#ifdef CONFIG_FAIR_GROUP_SCHED
return scale_load_down(tg->shares);
#else
- return sched_weight_from_cgroup(tg->scx_weight);
+ return sched_weight_from_cgroup(tg->scx.weight);
#endif
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e2d51f4306b3..615411a0a881 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
*/
if (dl_se->dl_defer && !dl_se->dl_defer_running &&
dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
- if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) {
+ if (!is_dl_boosted(dl_se)) {
/*
* Set dl_se->dl_defer_armed and dl_throttled variables to
@@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
-static bool dl_server_stopped(struct sched_dl_entity *dl_se);
-
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
{
struct rq *rq = rq_of_dl_se(dl_se);
@@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
if (!dl_se->dl_runtime)
return HRTIMER_NORESTART;
- if (!dl_se->server_has_tasks(dl_se)) {
- replenish_dl_entity(dl_se);
- dl_server_stopped(dl_se);
- return HRTIMER_NORESTART;
- }
-
if (dl_se->dl_defer_armed) {
/*
* First check if the server could consume runtime in background.
@@ -1496,10 +1488,12 @@ throttle:
}
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
- if (dl_server(dl_se))
- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
- else
+ if (dl_server(dl_se)) {
+ replenish_dl_new_period(dl_se, rq);
+ start_dl_timer(dl_se);
+ } else {
enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
}
if (!is_leftmost(dl_se, &rq->dl))
@@ -1577,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime) {
- dl_se->dl_server_idle = 0;
+ if (dl_se->dl_runtime)
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
- }
}
void dl_server_start(struct sched_dl_entity *dl_se)
@@ -1608,26 +1600,10 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
dl_se->dl_server_active = 0;
}
-static bool dl_server_stopped(struct sched_dl_entity *dl_se)
-{
- if (!dl_se->dl_server_active)
- return false;
-
- if (dl_se->dl_server_idle) {
- dl_server_stop(dl_se);
- return true;
- }
-
- dl_se->dl_server_idle = 1;
- return false;
-}
-
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
- dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task)
{
dl_se->rq = rq;
- dl_se->server_has_tasks = has_tasks;
dl_se->server_pick_task = pick_task;
}
@@ -1849,7 +1825,9 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
u64 deadline = dl_se->deadline;
dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
}
@@ -1859,7 +1837,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
}
@@ -2388,10 +2368,7 @@ again:
if (dl_server(dl_se)) {
p = dl_se->server_pick_task(dl_se);
if (!p) {
- if (!dl_server_stopped(dl_se)) {
- dl_se->dl_yielded = 1;
- update_curr_dl_se(rq, dl_se, 0);
- }
+ dl_server_stop(dl_se);
goto again;
}
rq->dl_server = dl_se;
@@ -2574,6 +2551,25 @@ static int find_later_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -2601,12 +2597,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
+ /*
+ * double_lock_balance had to release rq->lock, in the
+ * meantime, task may no longer be fit to be migrated.
+ * Check the following to ensure that the task is
+ * still suitable for migration:
+ * 1. It is possible the task was scheduled,
+ * migrate_disabled was set and then got preempted,
+ * so we must check the task migration disable
+ * flag.
+ * 2. The CPU picked is in the task's affinity.
+ * 3. For throttled task (dl_task_offline_migration),
+ * check the following:
+ * - the task is not on the rq anymore (it was
+ * migrated)
+ * - the task is not on CPU anymore
+ * - the task is still a dl task
+ * - the task is not queued on the rq anymore
+ * 4. For the non-throttled task (push_dl_task), the
+ * check to ensure that this task is still at the
+ * head of the pushable tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !dl_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -2629,25 +2650,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
-
- WARN_ON_ONCE(rq->cpu != task_cpu(p));
- WARN_ON_ONCE(task_current(rq, p));
- WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
-
- WARN_ON_ONCE(!task_on_rq_queued(p));
- WARN_ON_ONCE(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 3f06ab84d53f..02e16b70a790 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -376,10 +376,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
- if (rq->cfs.h_nr_queued) {
- update_rq_clock(rq);
- dl_server_stop(&rq->fair_server);
- }
+ update_rq_clock(rq);
+ dl_server_stop(&rq->fair_server);
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
if (retval)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7dedc9a16281..2b0e88206d07 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,1040 +9,6 @@
#include <linux/btf_ids.h>
#include "ext_idle.h"
-#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
-
-enum scx_consts {
- SCX_DSP_DFL_MAX_BATCH = 32,
- SCX_DSP_MAX_LOOPS = 32,
- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
-
- SCX_EXIT_BT_LEN = 64,
- SCX_EXIT_MSG_LEN = 1024,
- SCX_EXIT_DUMP_DFL_LEN = 32768,
-
- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
-
- /*
- * Iterating all tasks may take a while. Periodically drop
- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
- */
- SCX_TASK_ITER_BATCH = 32,
-};
-
-enum scx_exit_kind {
- SCX_EXIT_NONE,
- SCX_EXIT_DONE,
-
- SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
- SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
- SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
- SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
-
- SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
- SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
- SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
-};
-
-/*
- * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
- * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
- * are 64bit of the format:
- *
- * Bits: [63 .. 48 47 .. 32 31 .. 0]
- * [ SYS ACT ] [ SYS RSN ] [ USR ]
- *
- * SYS ACT: System-defined exit actions
- * SYS RSN: System-defined exit reasons
- * USR : User-defined exit codes and reasons
- *
- * Using the above, users may communicate intention and context by ORing system
- * actions and/or system reasons with a user-defined exit code.
- */
-enum scx_exit_code {
- /* Reasons */
- SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
-
- /* Actions */
- SCX_ECODE_ACT_RESTART = 1LLU << 48,
-};
-
-/*
- * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
- * being disabled.
- */
-struct scx_exit_info {
- /* %SCX_EXIT_* - broad category of the exit reason */
- enum scx_exit_kind kind;
-
- /* exit code if gracefully exiting */
- s64 exit_code;
-
- /* textual representation of the above */
- const char *reason;
-
- /* backtrace if exiting due to an error */
- unsigned long *bt;
- u32 bt_len;
-
- /* informational message */
- char *msg;
-
- /* debug dump */
- char *dump;
-};
-
-/* sched_ext_ops.flags */
-enum scx_ops_flags {
- /*
- * Keep built-in idle tracking even if ops.update_idle() is implemented.
- */
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
-
- /*
- * By default, if there are no other task to run on the CPU, ext core
- * keeps running the current task even after its slice expires. If this
- * flag is specified, such tasks are passed to ops.enqueue() with
- * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
- */
- SCX_OPS_ENQ_LAST = 1LLU << 1,
-
- /*
- * An exiting task may schedule after PF_EXITING is set. In such cases,
- * bpf_task_from_pid() may not be able to find the task and if the BPF
- * scheduler depends on pid lookup for dispatching, the task will be
- * lost leading to various issues including RCU grace period stalls.
- *
- * To mask this problem, by default, unhashed tasks are automatically
- * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
- * depend on pid lookups and wants to handle these tasks directly, the
- * following flag can be used.
- */
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
-
- /*
- * If set, only tasks with policy set to SCHED_EXT are attached to
- * sched_ext. If clear, SCHED_NORMAL tasks are also included.
- */
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
-
- /*
- * A migration disabled task can only execute on its current CPU. By
- * default, such tasks are automatically put on the CPU's local DSQ with
- * the default slice on enqueue. If this ops flag is set, they also go
- * through ops.enqueue().
- *
- * A migration disabled task never invokes ops.select_cpu() as it can
- * only select the current CPU. Also, p->cpus_ptr will only contain its
- * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
- * and thus may disagree with cpumask_weight(p->cpus_ptr).
- */
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
-
- /*
- * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
- * ops.enqueue() on the ops.select_cpu() selected or the wakee's
- * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
- * transfers. When this optimization is enabled, ops.select_cpu() is
- * skipped in some cases (when racing against the wakee switching out).
- * As the BPF scheduler may depend on ops.select_cpu() being invoked
- * during wakeups, queued wakeup is disabled by default.
- *
- * If this ops flag is set, queued wakeup optimization is enabled and
- * the BPF scheduler must be able to handle ops.enqueue() invoked on the
- * wakee's CPU without preceding ops.select_cpu() even for tasks which
- * may be executed on multiple CPUs.
- */
- SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
-
- /*
- * If set, enable per-node idle cpumasks. If clear, use a single global
- * flat idle cpumask.
- */
- SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
-
- /*
- * CPU cgroup support flags
- */
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
-
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
-
- /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
- __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
-};
-
-/* argument container for ops.init_task() */
-struct scx_init_task_args {
- /*
- * Set if ops.init_task() is being invoked on the fork path, as opposed
- * to the scheduler transition path.
- */
- bool fork;
-#ifdef CONFIG_EXT_GROUP_SCHED
- /* the cgroup the task is joining */
- struct cgroup *cgroup;
-#endif
-};
-
-/* argument container for ops.exit_task() */
-struct scx_exit_task_args {
- /* Whether the task exited before running on sched_ext. */
- bool cancelled;
-};
-
-/* argument container for ops->cgroup_init() */
-struct scx_cgroup_init_args {
- /* the weight of the cgroup [1..10000] */
- u32 weight;
-
- /* bandwidth control parameters from cpu.max and cpu.max.burst */
- u64 bw_period_us;
- u64 bw_quota_us;
- u64 bw_burst_us;
-};
-
-enum scx_cpu_preempt_reason {
- /* next task is being scheduled by &sched_class_rt */
- SCX_CPU_PREEMPT_RT,
- /* next task is being scheduled by &sched_class_dl */
- SCX_CPU_PREEMPT_DL,
- /* next task is being scheduled by &sched_class_stop */
- SCX_CPU_PREEMPT_STOP,
- /* unknown reason for SCX being preempted */
- SCX_CPU_PREEMPT_UNKNOWN,
-};
-
-/*
- * Argument container for ops->cpu_acquire(). Currently empty, but may be
- * expanded in the future.
- */
-struct scx_cpu_acquire_args {};
-
-/* argument container for ops->cpu_release() */
-struct scx_cpu_release_args {
- /* the reason the CPU was preempted */
- enum scx_cpu_preempt_reason reason;
-
- /* the task that's going to be scheduled on the CPU */
- struct task_struct *task;
-};
-
-/*
- * Informational context provided to dump operations.
- */
-struct scx_dump_ctx {
- enum scx_exit_kind kind;
- s64 exit_code;
- const char *reason;
- u64 at_ns;
- u64 at_jiffies;
-};
-
-/**
- * struct sched_ext_ops - Operation table for BPF scheduler implementation
- *
- * A BPF scheduler can implement an arbitrary scheduling policy by
- * implementing and loading operations in this table. Note that a userland
- * scheduling policy can also be implemented using the BPF scheduler
- * as a shim layer.
- */
-struct sched_ext_ops {
- /**
- * @select_cpu: Pick the target CPU for a task which is being woken up
- * @p: task being woken up
- * @prev_cpu: the cpu @p was on before sleeping
- * @wake_flags: SCX_WAKE_*
- *
- * Decision made here isn't final. @p may be moved to any CPU while it
- * is getting dispatched for execution later. However, as @p is not on
- * the rq at this point, getting the eventual execution CPU right here
- * saves a small bit of overhead down the line.
- *
- * If an idle CPU is returned, the CPU is kicked and will try to
- * dispatch. While an explicit custom mechanism can be added,
- * select_cpu() serves as the default way to wake up idle CPUs.
- *
- * @p may be inserted into a DSQ directly by calling
- * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
- * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
- * of the CPU returned by this operation.
- *
- * Note that select_cpu() is never called for tasks that can only run
- * on a single CPU or tasks with migration disabled, as they don't have
- * the option to select a different CPU. See select_task_rq() for
- * details.
- */
- s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
-
- /**
- * @enqueue: Enqueue a task on the BPF scheduler
- * @p: task being enqueued
- * @enq_flags: %SCX_ENQ_*
- *
- * @p is ready to run. Insert directly into a DSQ by calling
- * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
- * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
- * the task will stall.
- *
- * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
- * skipped.
- */
- void (*enqueue)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @dequeue: Remove a task from the BPF scheduler
- * @p: task being dequeued
- * @deq_flags: %SCX_DEQ_*
- *
- * Remove @p from the BPF scheduler. This is usually called to isolate
- * the task while updating its scheduling properties (e.g. priority).
- *
- * The ext core keeps track of whether the BPF side owns a given task or
- * not and can gracefully ignore spurious dispatches from BPF side,
- * which makes it safe to not implement this method. However, depending
- * on the scheduling logic, this can lead to confusing behaviors - e.g.
- * scheduling position not being updated across a priority change.
- */
- void (*dequeue)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
- * @cpu: CPU to dispatch tasks for
- * @prev: previous task being switched out
- *
- * Called when a CPU's local dsq is empty. The operation should dispatch
- * one or more tasks from the BPF scheduler into the DSQs using
- * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
- * using scx_bpf_dsq_move_to_local().
- *
- * The maximum number of times scx_bpf_dsq_insert() can be called
- * without an intervening scx_bpf_dsq_move_to_local() is specified by
- * ops.dispatch_max_batch. See the comments on top of the two functions
- * for more details.
- *
- * When not %NULL, @prev is an SCX task with its slice depleted. If
- * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
- * @prev->scx.flags, it is not enqueued yet and will be enqueued after
- * ops.dispatch() returns. To keep executing @prev, return without
- * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
- */
- void (*dispatch)(s32 cpu, struct task_struct *prev);
-
- /**
- * @tick: Periodic tick
- * @p: task running currently
- *
- * This operation is called every 1/HZ seconds on CPUs which are
- * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
- * immediate dispatch cycle on the CPU.
- */
- void (*tick)(struct task_struct *p);
-
- /**
- * @runnable: A task is becoming runnable on its associated CPU
- * @p: task becoming runnable
- * @enq_flags: %SCX_ENQ_*
- *
- * This and the following three functions can be used to track a task's
- * execution state transitions. A task becomes ->runnable() on a CPU,
- * and then goes through one or more ->running() and ->stopping() pairs
- * as it runs on the CPU, and eventually becomes ->quiescent() when it's
- * done running on the CPU.
- *
- * @p is becoming runnable on the CPU because it's
- *
- * - waking up (%SCX_ENQ_WAKEUP)
- * - being moved from another CPU
- * - being restored after temporarily taken off the queue for an
- * attribute change.
- *
- * This and ->enqueue() are related but not coupled. This operation
- * notifies @p's state transition and may not be followed by ->enqueue()
- * e.g. when @p is being dispatched to a remote CPU, or when @p is
- * being enqueued on a CPU experiencing a hotplug event. Likewise, a
- * task may be ->enqueue()'d without being preceded by this operation
- * e.g. after exhausting its slice.
- */
- void (*runnable)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @running: A task is starting to run on its associated CPU
- * @p: task starting to run
- *
- * Note that this callback may be called from a CPU other than the
- * one the task is going to run on. This can happen when a task
- * property is changed (i.e., affinity), since scx_next_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to determine the
- * target CPU the task is going to use.
- *
- * See ->runnable() for explanation on the task state notifiers.
- */
- void (*running)(struct task_struct *p);
-
- /**
- * @stopping: A task is stopping execution
- * @p: task stopping to run
- * @runnable: is task @p still runnable?
- *
- * Note that this callback may be called from a CPU other than the
- * one the task was running on. This can happen when a task
- * property is changed (i.e., affinity), since dequeue_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
- * the task was running on.
- *
- * See ->runnable() for explanation on the task state notifiers. If
- * !@runnable, ->quiescent() will be invoked after this operation
- * returns.
- */
- void (*stopping)(struct task_struct *p, bool runnable);
-
- /**
- * @quiescent: A task is becoming not runnable on its associated CPU
- * @p: task becoming not runnable
- * @deq_flags: %SCX_DEQ_*
- *
- * See ->runnable() for explanation on the task state notifiers.
- *
- * @p is becoming quiescent on the CPU because it's
- *
- * - sleeping (%SCX_DEQ_SLEEP)
- * - being moved to another CPU
- * - being temporarily taken off the queue for an attribute change
- * (%SCX_DEQ_SAVE)
- *
- * This and ->dequeue() are related but not coupled. This operation
- * notifies @p's state transition and may not be preceded by ->dequeue()
- * e.g. when @p is being dispatched to a remote CPU.
- */
- void (*quiescent)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @yield: Yield CPU
- * @from: yielding task
- * @to: optional yield target task
- *
- * If @to is NULL, @from is yielding the CPU to other runnable tasks.
- * The BPF scheduler should ensure that other available tasks are
- * dispatched before the yielding task. Return value is ignored in this
- * case.
- *
- * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
- * scheduler can implement the request, return %true; otherwise, %false.
- */
- bool (*yield)(struct task_struct *from, struct task_struct *to);
-
- /**
- * @core_sched_before: Task ordering for core-sched
- * @a: task A
- * @b: task B
- *
- * Used by core-sched to determine the ordering between two tasks. See
- * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
- * core-sched.
- *
- * Both @a and @b are runnable and may or may not currently be queued on
- * the BPF scheduler. Should return %true if @a should run before @b.
- * %false if there's no required ordering or @b should run before @a.
- *
- * If not specified, the default is ordering them according to when they
- * became runnable.
- */
- bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
-
- /**
- * @set_weight: Set task weight
- * @p: task to set weight for
- * @weight: new weight [1..10000]
- *
- * Update @p's weight to @weight.
- */
- void (*set_weight)(struct task_struct *p, u32 weight);
-
- /**
- * @set_cpumask: Set CPU affinity
- * @p: task to set CPU affinity for
- * @cpumask: cpumask of cpus that @p can run on
- *
- * Update @p's CPU affinity to @cpumask.
- */
- void (*set_cpumask)(struct task_struct *p,
- const struct cpumask *cpumask);
-
- /**
- * @update_idle: Update the idle state of a CPU
- * @cpu: CPU to update the idle state for
- * @idle: whether entering or exiting the idle state
- *
- * This operation is called when @rq's CPU goes or leaves the idle
- * state. By default, implementing this operation disables the built-in
- * idle CPU tracking and the following helpers become unavailable:
- *
- * - scx_bpf_select_cpu_dfl()
- * - scx_bpf_select_cpu_and()
- * - scx_bpf_test_and_clear_cpu_idle()
- * - scx_bpf_pick_idle_cpu()
- *
- * The user also must implement ops.select_cpu() as the default
- * implementation relies on scx_bpf_select_cpu_dfl().
- *
- * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
- * tracking.
- */
- void (*update_idle)(s32 cpu, bool idle);
-
- /**
- * @cpu_acquire: A CPU is becoming available to the BPF scheduler
- * @cpu: The CPU being acquired by the BPF scheduler.
- * @args: Acquire arguments, see the struct definition.
- *
- * A CPU that was previously released from the BPF scheduler is now once
- * again under its control.
- */
- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-
- /**
- * @cpu_release: A CPU is taken away from the BPF scheduler
- * @cpu: The CPU being released by the BPF scheduler.
- * @args: Release arguments, see the struct definition.
- *
- * The specified CPU is no longer under the control of the BPF
- * scheduler. This could be because it was preempted by a higher
- * priority sched_class, though there may be other reasons as well. The
- * caller should consult @args->reason to determine the cause.
- */
- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-
- /**
- * @init_task: Initialize a task to run in a BPF scheduler
- * @p: task to initialize for BPF scheduling
- * @args: init arguments, see the struct definition
- *
- * Either we're loading a BPF scheduler or a new task is being forked.
- * Initialize @p for BPF scheduling. This operation may block and can
- * be used for allocations, and is called exactly once for a task.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During a fork, it
- * will abort that specific fork.
- */
- s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
-
- /**
- * @exit_task: Exit a previously-running task from the system
- * @p: task to exit
- * @args: exit arguments, see the struct definition
- *
- * @p is exiting or the BPF scheduler is being unloaded. Perform any
- * necessary cleanup for @p.
- */
- void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
-
- /**
- * @enable: Enable BPF scheduling for a task
- * @p: task to enable BPF scheduling for
- *
- * Enable @p for BPF scheduling. enable() is called on @p any time it
- * enters SCX, and is always paired with a matching disable().
- */
- void (*enable)(struct task_struct *p);
-
- /**
- * @disable: Disable BPF scheduling for a task
- * @p: task to disable BPF scheduling for
- *
- * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
- * Disable BPF scheduling for @p. A disable() call is always matched
- * with a prior enable() call.
- */
- void (*disable)(struct task_struct *p);
-
- /**
- * @dump: Dump BPF scheduler state on error
- * @ctx: debug dump context
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
- */
- void (*dump)(struct scx_dump_ctx *ctx);
-
- /**
- * @dump_cpu: Dump BPF scheduler state for a CPU on error
- * @ctx: debug dump context
- * @cpu: CPU to generate debug dump for
- * @idle: @cpu is currently idle without any runnable tasks
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @cpu. If @idle is %true and this operation doesn't produce any
- * output, @cpu is skipped for dump.
- */
- void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
-
- /**
- * @dump_task: Dump BPF scheduler state for a runnable task on error
- * @ctx: debug dump context
- * @p: runnable task to generate debug dump for
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @p.
- */
- void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
-
-#ifdef CONFIG_EXT_GROUP_SCHED
- /**
- * @cgroup_init: Initialize a cgroup
- * @cgrp: cgroup being initialized
- * @args: init arguments, see the struct definition
- *
- * Either the BPF scheduler is being loaded or @cgrp created, initialize
- * @cgrp for sched_ext. This operation may block.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During cgroup
- * creation, it will abort the specific cgroup creation.
- */
- s32 (*cgroup_init)(struct cgroup *cgrp,
- struct scx_cgroup_init_args *args);
-
- /**
- * @cgroup_exit: Exit a cgroup
- * @cgrp: cgroup being exited
- *
- * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
- * @cgrp for sched_ext. This operation my block.
- */
- void (*cgroup_exit)(struct cgroup *cgrp);
-
- /**
- * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Prepare @p for move from cgroup @from to @to. This operation may
- * block and can be used for allocations.
- *
- * Return 0 for success, -errno for failure. An error return aborts the
- * migration.
- */
- s32 (*cgroup_prep_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_move: Commit cgroup move
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Commit the move. @p is dequeued during this operation.
- */
- void (*cgroup_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_cancel_move: Cancel cgroup move
- * @p: task whose cgroup move is being canceled
- * @from: cgroup @p was being moved from
- * @to: cgroup @p was being moved to
- *
- * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
- * Undo the preparation.
- */
- void (*cgroup_cancel_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_set_weight: A cgroup's weight is being changed
- * @cgrp: cgroup whose weight is being updated
- * @weight: new weight [1..10000]
- *
- * Update @cgrp's weight to @weight.
- */
- void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-
- /**
- * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
- * @cgrp: cgroup whose bandwidth is being updated
- * @period_us: bandwidth control period
- * @quota_us: bandwidth control quota
- * @burst_us: bandwidth control burst
- *
- * Update @cgrp's bandwidth control parameters. This is from the cpu.max
- * cgroup interface.
- *
- * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
- * to. For example, if @period_us is 1_000_000 and @quota_us is
- * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
- * interpreted in the same fashion and specifies how much @cgrp can
- * burst temporarily. The specific control mechanism and thus the
- * interpretation of @period_us and burstiness is upto to the BPF
- * scheduler.
- */
- void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
- u64 period_us, u64 quota_us, u64 burst_us);
-
-#endif /* CONFIG_EXT_GROUP_SCHED */
-
- /*
- * All online ops must come before ops.cpu_online().
- */
-
- /**
- * @cpu_online: A CPU became online
- * @cpu: CPU which just came up
- *
- * @cpu just came online. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
- */
- void (*cpu_online)(s32 cpu);
-
- /**
- * @cpu_offline: A CPU is going offline
- * @cpu: CPU which is going offline
- *
- * @cpu is going offline. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
- */
- void (*cpu_offline)(s32 cpu);
-
- /*
- * All CPU hotplug ops must come before ops.init().
- */
-
- /**
- * @init: Initialize the BPF scheduler
- */
- s32 (*init)(void);
-
- /**
- * @exit: Clean up after the BPF scheduler
- * @info: Exit info
- *
- * ops.exit() is also called on ops.init() failure, which is a bit
- * unusual. This is to allow rich reporting through @info on how
- * ops.init() failed.
- */
- void (*exit)(struct scx_exit_info *info);
-
- /**
- * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
- */
- u32 dispatch_max_batch;
-
- /**
- * @flags: %SCX_OPS_* flags
- */
- u64 flags;
-
- /**
- * @timeout_ms: The maximum amount of time, in milliseconds, that a
- * runnable task should be able to wait before being scheduled. The
- * maximum timeout may not exceed the default timeout of 30 seconds.
- *
- * Defaults to the maximum allowed timeout value of 30 seconds.
- */
- u32 timeout_ms;
-
- /**
- * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
- * value of 32768 is used.
- */
- u32 exit_dump_len;
-
- /**
- * @hotplug_seq: A sequence number that may be set by the scheduler to
- * detect when a hotplug event has occurred during the loading process.
- * If 0, no detection occurs. Otherwise, the scheduler will fail to
- * load if the sequence number does not match @scx_hotplug_seq on the
- * enable path.
- */
- u64 hotplug_seq;
-
- /**
- * @name: BPF scheduler's name
- *
- * Must be a non-zero valid BPF object name including only isalnum(),
- * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
- * BPF scheduler is enabled.
- */
- char name[SCX_OPS_NAME_LEN];
-
- /* internal use only, must be NULL */
- void *priv;
-};
-
-enum scx_opi {
- SCX_OPI_BEGIN = 0,
- SCX_OPI_NORMAL_BEGIN = 0,
- SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
- SCX_OPI_END = SCX_OP_IDX(init),
-};
-
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * Total number of times a task's time slice was refilled with the
- * default value (SCX_SLICE_DFL).
- */
- s64 SCX_EV_REFILL_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-struct scx_sched {
- struct sched_ext_ops ops;
- DECLARE_BITMAP(has_op, SCX_OPI_END);
-
- /*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
- * This is to avoid live-locking in bypass mode where all tasks are
- * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
- * per-node split isn't sufficient, it can be further split.
- */
- struct rhashtable dsq_hash;
- struct scx_dispatch_q **global_dsqs;
-
- /*
- * The event counters are in a per-CPU variable to minimize the
- * accounting overhead. A system-wide view on the event counter is
- * constructed when requested by scx_bpf_events().
- */
- struct scx_event_stats __percpu *event_stats_cpu;
-
- bool warned_zero_slice;
-
- atomic_t exit_kind;
- struct scx_exit_info *exit_info;
-
- struct kobject kobj;
-
- struct kthread_worker *helper;
- struct irq_work error_irq_work;
- struct kthread_work disable_work;
- struct rcu_work rcu_work;
-};
-
-enum scx_wake_flags {
- /* expose select WF_* flags as enums */
- SCX_WAKE_FORK = WF_FORK,
- SCX_WAKE_TTWU = WF_TTWU,
- SCX_WAKE_SYNC = WF_SYNC,
-};
-
-enum scx_enq_flags {
- /* expose select ENQUEUE_* flags as enums */
- SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
- SCX_ENQ_HEAD = ENQUEUE_HEAD,
- SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
-
- /* high 32bits are SCX specific */
-
- /*
- * Set the following to trigger preemption when calling
- * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
- * current task is cleared to zero and the CPU is kicked into the
- * scheduling path. Implies %SCX_ENQ_HEAD.
- */
- SCX_ENQ_PREEMPT = 1LLU << 32,
-
- /*
- * The task being enqueued was previously enqueued on the current CPU's
- * %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
- * invoked in a ->cpu_release() callback, and the task is again
- * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
- * task will not be scheduled on the CPU until at least the next invocation
- * of the ->cpu_acquire() callback.
- */
- SCX_ENQ_REENQ = 1LLU << 40,
-
- /*
- * The task being enqueued is the only task available for the cpu. By
- * default, ext core keeps executing such tasks but when
- * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
- * %SCX_ENQ_LAST flag set.
- *
- * The BPF scheduler is responsible for triggering a follow-up
- * scheduling event. Otherwise, Execution may stall.
- */
- SCX_ENQ_LAST = 1LLU << 41,
-
- /* high 8 bits are internal */
- __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
- SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
-};
-
-enum scx_deq_flags {
- /* expose select DEQUEUE_* flags as enums */
- SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
-
- /* high 32bits are SCX specific */
-
- /*
- * The generic core-sched layer decided to execute the task even though
- * it hasn't been dispatched yet. Dequeue from the BPF side.
- */
- SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
-};
-
-enum scx_pick_idle_cpu_flags {
- SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
- SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
-};
-
-enum scx_kick_flags {
- /*
- * Kick the target CPU if idle. Guarantees that the target CPU goes
- * through at least one full scheduling cycle before going idle. If the
- * target CPU can be determined to be currently not idle and going to go
- * through a scheduling cycle before going idle, noop.
- */
- SCX_KICK_IDLE = 1LLU << 0,
-
- /*
- * Preempt the current task and execute the dispatch path. If the
- * current task of the target CPU is an SCX task, its ->scx.slice is
- * cleared to zero before the scheduling path is invoked so that the
- * task expires and the dispatch path is invoked.
- */
- SCX_KICK_PREEMPT = 1LLU << 1,
-
- /*
- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
- * return after the target CPU finishes picking the next task.
- */
- SCX_KICK_WAIT = 1LLU << 2,
-};
-
-enum scx_tg_flags {
- SCX_TG_ONLINE = 1U << 0,
- SCX_TG_INITED = 1U << 1,
-};
-
-enum scx_enable_state {
- SCX_ENABLING,
- SCX_ENABLED,
- SCX_DISABLING,
- SCX_DISABLED,
-};
-
-static const char *scx_enable_state_str[] = {
- [SCX_ENABLING] = "enabling",
- [SCX_ENABLED] = "enabled",
- [SCX_DISABLING] = "disabling",
- [SCX_DISABLED] = "disabled",
-};
-
-/*
- * sched_ext_entity->ops_state
- *
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
- *
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
- *
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
- *
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
- */
-enum scx_ops_state {
- SCX_OPSS_NONE, /* owned by the SCX core */
- SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
- SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
- SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
-
- /*
- * QSEQ brands each QUEUED instance so that, when dispatch races
- * dequeue/requeue, the dispatcher can tell whether it still has a claim
- * on the task being dispatched.
- *
- * As some 32bit archs can't do 64bit store_release/load_acquire,
- * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
- * 32bit machines. The dispatch race window QSEQ protects is very narrow
- * and runs with IRQ disabled. 30 bits should be sufficient.
- */
- SCX_OPSS_QSEQ_SHIFT = 2,
-};
-
-/* Use macros to ensure that the type is unsigned long for the masks */
-#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
-#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
-
/*
* NOTE: sched_ext is in the process of growing multiple scheduler support and
* scx_root usage is in a transitional state. Naked dereferences are safe if the
@@ -1170,7 +136,7 @@ static struct kset *scx_kset;
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
-static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
@@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch,
va_end(args);
}
-static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, ...)
-{
- struct scx_sched *sch;
- va_list args;
-
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
- if (sch) {
- va_start(args, fmt);
- scx_vexit(sch, kind, exit_code, fmt, args);
- va_end(args);
- }
- rcu_read_unlock();
-}
-
#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
-#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
@@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b)
return (s32)(a - b) < 0;
}
-static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
+static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
+ struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
-
return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
@@ -1363,11 +311,11 @@ do { \
})
/* @mask is constant, always inline to cull unnecessary branches */
-static __always_inline bool scx_kf_allowed(u32 mask)
+static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_kf_error("cpu_release kfunc called from a nested operation");
+ scx_error(sch, "cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_kf_error("dispatch kfunc called from a nested operation");
+ scx_error(sch, "dispatch kfunc called from a nested operation");
return false;
}
@@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask)
}
/* see SCX_CALL_OP_TASK() */
-static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
+static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
+ u32 mask,
struct task_struct *p)
{
- if (!scx_kf_allowed(mask))
+ if (!scx_kf_allowed(sch, mask))
return false;
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_kf_error("called on a task not being operated on");
+ scx_error(sch, "called on a task not being operated on");
return false;
}
@@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq {
*/
struct scx_task_iter {
struct sched_ext_entity cursor;
- struct task_struct *locked;
+ struct task_struct *locked_task;
struct rq *rq;
struct rq_flags rf;
u32 cnt;
+ bool list_locked;
};
/**
@@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
- iter->locked = NULL;
+ iter->locked_task = NULL;
iter->cnt = 0;
+ iter->list_locked = true;
}
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
- if (iter->locked) {
- task_rq_unlock(iter->rq, iter->locked, &iter->rf);
- iter->locked = NULL;
+ if (iter->locked_task) {
+ task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
+ iter->locked_task = NULL;
}
}
@@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
*
* If @iter is in the middle of a locked iteration, it may be locking the rq of
* the task currently being visited in addition to scx_tasks_lock. Unlock both.
- * This function can be safely called anytime during an iteration.
+ * This function can be safely called anytime during an iteration. The next
+ * iterator operation will automatically restore the necessary locking.
*/
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
__scx_task_iter_rq_unlock(iter);
- spin_unlock_irq(&scx_tasks_lock);
+ if (iter->list_locked) {
+ iter->list_locked = false;
+ spin_unlock_irq(&scx_tasks_lock);
+ }
}
-/**
- * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
- * @iter: iterator to re-lock
- *
- * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
- * doesn't re-lock the rq lock. Must be called before other iterator operations.
- */
-static void scx_task_iter_relock(struct scx_task_iter *iter)
+static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
- spin_lock_irq(&scx_tasks_lock);
+ if (!iter->list_locked) {
+ spin_lock_irq(&scx_tasks_lock);
+ iter->list_locked = true;
+ }
}
/**
@@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter)
*/
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
+ __scx_task_iter_maybe_relock(iter);
list_del_init(&iter->cursor.tasks_node);
scx_task_iter_unlock(iter);
}
@@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
+ __scx_task_iter_maybe_relock(iter);
+
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
- scx_task_iter_relock(iter);
+ __scx_task_iter_maybe_relock(iter);
}
list_for_each_entry(pos, cursor, tasks_node) {
@@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return NULL;
iter->rq = task_rq_lock(p, &iter->rf);
- iter->locked = p;
+ iter->locked_task = p;
return p;
}
@@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This can be used when preemption is not disabled.
*/
#define scx_add_event(sch, name, cnt) do { \
- this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, (cnt)); \
} while(0)
@@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This should be used only when preemption is disabled.
*/
#define __scx_add_event(sch, name, cnt) do { \
- __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
@@ -1766,23 +720,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
}
/**
- * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
- * @cpu: cpu number which came from a BPF ops
- * @where: extra information reported on error
- *
- * The same as ops_cpu_valid() but @sch is implicit.
- */
-static bool kf_cpu_valid(u32 cpu, const char *where)
-{
- if (__cpu_valid(cpu)) {
- return true;
- } else {
- scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
- return false;
- }
-}
-
-/**
* ops_sanitize_err - Sanitize a -errno value
* @sch: scx_sched to error out on error
* @ops_name: operation to blame on failure
@@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void refill_task_slice_dfl(struct task_struct *p)
+static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+ __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
@@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
raw_spin_lock(&dsq->lock);
}
}
@@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
return &cpu_rq(cpu)->scx.local_dsq;
}
if (dsq_id == SCX_DSQ_GLOBAL)
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
else
dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
dsq_id, p->comm, p->pid);
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
}
return dsq;
}
-static void mark_direct_dispatch(struct task_struct *ddsp_task,
+static void mark_direct_dispatch(struct scx_sched *sch,
+ struct task_struct *ddsp_task,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
@@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_kf_error("%s[%d] already direct-dispatched",
+ scx_error(sch, "%s[%d] already direct-dispatched",
p->comm, p->pid);
else
- scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
ddsp_task->comm, ddsp_task->pid,
p->comm, p->pid);
return;
@@ -2333,15 +1271,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
local_norefill:
dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- refill_task_slice_dfl(p);
- dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(sch, p);
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch,
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(scx_root,
- SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dst_dsq = find_global_dsq(p);
+ dst_dsq = find_global_dsq(sch, p);
dst_rq = src_rq;
}
} else {
@@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dispatch_enqueue(sch, find_global_dsq(p), p,
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* balance(), we want to complete this scheduling cycle and then
* start a new one. IOW, we want to call resched_curr() on the
* next, most likely idle, task, not the current one. Use
- * scx_bpf_kick_cpu() for deferred kicking.
+ * scx_kick_cpu() for deferred kicking.
*/
if (unlikely(!--nr_loops)) {
- scx_bpf_kick_cpu(cpu_of(rq), 0);
+ scx_kick_cpu(sch, cpu_of(rq), 0);
break;
}
} while (dspc->nr_tasks);
@@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq)
if (keep_prev) {
p = prev;
if (!p->scx.slice)
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
} else {
p = first_local_task(rq);
if (!p) {
if (kick_idle)
- scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
+ scx_kick_cpu(rcu_dereference_sched(scx_root),
+ cpu_of(rq), SCX_KICK_IDLE);
return NULL;
}
if (unlikely(!p->scx.slice)) {
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = rcu_dereference_sched(scx_root);
if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
sch->warned_zero_slice = true;
}
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
}
}
@@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
} else {
cpu = prev_cpu;
@@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq)
#ifdef CONFIG_EXT_GROUP_SCHED
-DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
static bool scx_cgroup_enabled;
void scx_tg_init(struct task_group *tg)
@@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg)
WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
@@ -4122,7 +3058,6 @@ int scx_tg_online(struct task_group *tg)
tg->scx.flags |= SCX_TG_ONLINE;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ret;
}
@@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg)
WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
(tg->scx.flags & SCX_TG_INITED))
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
tg->css.cgroup);
tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
-
- percpu_up_read(&scx_cgroup_rwsem);
}
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
struct task_struct *p;
int ret;
- /* released in scx_finish/cancel_attach() */
- percpu_down_read(&scx_cgroup_rwsem);
-
if (!scx_cgroup_enabled)
return 0;
@@ -4192,7 +3120,6 @@ err:
p->scx.cgrp_moving_from = NULL;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
@@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p)
p->scx.cgrp_moving_from = NULL;
}
-void scx_cgroup_finish_attach(void)
-{
- percpu_up_read(&scx_cgroup_rwsem);
-}
-
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
struct scx_sched *sch = scx_root;
@@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
struct task_struct *p;
if (!scx_cgroup_enabled)
- goto out_unlock;
+ return;
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
@@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
-out_unlock:
- percpu_up_read(&scx_cgroup_rwsem);
}
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
@@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
tg->scx.weight = weight;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
void scx_group_set_idle(struct task_group *tg, bool idle)
@@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg,
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
(tg->scx.bw_period_us != period_us ||
@@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg,
tg->scx.bw_quota_us = quota_us;
tg->scx.bw_burst_us = burst_us;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
static void scx_cgroup_lock(void)
{
- percpu_down_write(&scx_cgroup_rwsem);
+ percpu_down_write(&scx_cgroup_ops_rwsem);
+ cgroup_lock();
}
static void scx_cgroup_unlock(void)
{
- percpu_up_write(&scx_cgroup_rwsem);
+ cgroup_unlock();
+ percpu_up_write(&scx_cgroup_ops_rwsem);
}
#else /* CONFIG_EXT_GROUP_SCHED */
-static inline void scx_cgroup_lock(void) {}
-static inline void scx_cgroup_unlock(void) {}
+static void scx_cgroup_lock(void) {}
+static void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
@@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
scx_cgroup_enabled = false;
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and exit all the inited ones, all online cgroups are exited.
*/
- rcu_read_lock();
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
@@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
if (!sch->ops.cgroup_exit)
continue;
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
css->cgroup);
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
}
static int scx_cgroup_init(struct scx_sched *sch)
@@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch)
struct cgroup_subsys_state *css;
int ret;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and init, all online cgroups are initialized.
*/
- rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
struct scx_cgroup_init_args args = {
@@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
continue;
}
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
@@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
return ret;
}
tg->scx.flags |= SCX_TG_INITED;
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
WARN_ON_ONCE(scx_cgroup_enabled);
scx_cgroup_enabled = true;
@@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
int node;
kthread_stop(sch->helper->task);
- free_percpu(sch->event_stats_cpu);
+ free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -4671,9 +3571,22 @@ bool task_should_scx(int policy)
bool scx_allow_ttwu_queue(const struct task_struct *p)
{
- return !scx_enabled() ||
- (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
- p->sched_class != &ext_sched_class;
+ struct scx_sched *sch;
+
+ if (!scx_enabled())
+ return true;
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return true;
+
+ if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ return true;
+
+ if (unlikely(p->sched_class != &ext_sched_class))
+ return true;
+
+ return false;
}
/**
@@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void)
*
* - pick_next_task() suppresses zero slice warning.
*
- * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
* operations.
*
* - scx_prio_less() reverts to the default core_sched_at order.
@@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
- dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
+ dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
+ p->migration_disabled);
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
@@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
sch->global_dsqs[node] = dsq;
}
- sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
- if (!sch->event_stats_cpu)
+ sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
+ if (!sch->pcpu)
goto err_free_gdsqs;
sch->helper = kthread_run_worker(0, "sched_ext_helper");
if (!sch->helper)
- goto err_free_event_stats;
+ goto err_free_pcpu;
sched_set_fifo(sch->helper->task);
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
@@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
err_stop_helper:
kthread_stop(sch->helper->task);
-err_free_event_stats:
- free_percpu(sch->event_stats_cpu);
+err_free_pcpu:
+ free_percpu(sch->pcpu);
err_free_gdsqs:
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
@@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
- scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid);
@@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_set_task_state(p, SCX_TASK_READY);
put_task_struct(p);
- scx_task_iter_relock(&sti);
}
scx_task_iter_stop(&sti);
scx_cgroup_unlock();
@@ -5749,6 +4662,9 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
__setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (!tryget_task_struct(p))
+ continue;
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
@@ -5761,6 +4677,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
+ put_task_struct(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -5791,7 +4708,7 @@ err_unlock:
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_bypass(false);
+ /* we'll soon enter disable path, keep bypass on */
err_disable:
mutex_unlock(&scx_enable_mutex);
/*
@@ -6324,40 +5241,41 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
+static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
- if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+ if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
return false;
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_kf_error("called with NULL task");
+ scx_error(sch, "called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
return false;
}
return true;
}
-static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
+static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 enq_flags)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct task_struct *ddsp_task;
ddsp_task = __this_cpu_read(direct_dispatch_task);
if (ddsp_task) {
- mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
+ mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags);
return;
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_kf_error("dispatch buffer overflow");
+ scx_error(sch, "dispatch buffer overflow");
return;
}
@@ -6409,7 +5327,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6417,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
else
p->scx.slice = p->scx.slice ?: 1;
- scx_dsq_insert_commit(p, dsq_id, enq_flags);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
}
/**
@@ -6444,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
u64 slice, u64 vtime, u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6454,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
p->scx.dsq_vtime = vtime;
- scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
__bpf_kfunc_end_defs();
@@ -6479,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
bool in_balance;
unsigned long flags;
- if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
+ if (!scx_kf_allowed_if_unlocked() &&
+ !scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
/*
@@ -6564,7 +5497,15 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
{
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return 0;
return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
@@ -6579,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
__bpf_kfunc void scx_bpf_dispatch_cancel(void)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_sched *sch;
+
+ guard(rcu)();
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return;
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_kf_error("dispatch buffer underflow");
+ scx_error(sch, "dispatch buffer underflow");
}
/**
@@ -6605,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
- struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
flush_dispatch_buf(sch, dspc->rq);
@@ -6756,12 +5710,18 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
{
+ struct scx_sched *sch;
LIST_HEAD(tasks);
u32 nr_enqueued = 0;
struct rq *rq;
struct task_struct *p, *n;
- if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
return 0;
rq = cpu_rq(smp_processor_id());
@@ -6784,12 +5744,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
- *
- * Also skip re-enqueueing tasks that can only run on this
- * CPU, as they would just be re-added to the same local
- * DSQ without any benefit.
*/
- if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
+ if (p->migration_pending)
continue;
dispatch_dequeue(rq, p);
@@ -6877,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
__bpf_kfunc_start_defs();
-/**
- * scx_bpf_kick_cpu - Trigger reschedule on a CPU
- * @cpu: cpu to kick
- * @flags: %SCX_KICK_* flags
- *
- * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
- * trigger rescheduling on a busy CPU. This can be called from any online
- * scx_ops operation and the actual kicking is performed asynchronously through
- * an irq work.
- */
-__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
{
struct rq *this_rq;
unsigned long irq_flags;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!ops_cpu_valid(sch, cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6916,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6941,6 +5887,26 @@ out:
}
/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (likely(sch))
+ scx_kick_cpu(sch, cpu, flags);
+}
+
+/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
*
@@ -7120,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
__bpf_kfunc_end_defs();
-static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
- char *fmt, unsigned long long *data, u32 data__sz)
+static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
+ size_t line_size, char *fmt, unsigned long long *data,
+ u32 data__sz)
{
struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
s32 ret;
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
+ scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_kf_error("failed to read data fields (%d)", ret);
+ scx_error(sch, "failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_kf_error("format preparation failed (%d)", ret);
+ scx_error(sch, "format preparation failed (%d)", ret);
return ret;
}
@@ -7149,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
+ scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
return ret;
}
-static s32 bstr_format(struct scx_bstr_buf *buf,
+static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
char *fmt, unsigned long long *data, u32 data__sz)
{
- return __bstr_format(buf->data, buf->line, sizeof(buf->line),
+ return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
fmt, data, data__sz);
}
@@ -7178,11 +6145,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7198,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7221,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
struct scx_dump_data *dd = &scx_dump_data;
struct scx_bstr_buf *buf = &dd->buf;
s32 ret;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (raw_smp_processor_id() != dd->cpu) {
- scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
/* append the formatted string to the line buf */
- ret = __bstr_format(buf->data, buf->line + dd->cursor,
+ ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
if (ret < 0) {
dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
@@ -7267,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7289,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7311,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
*/
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(sch);
+ if (unlikely(!sch))
+ return;
+
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (kf_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -7325,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
* to the corresponding CPU to prevent ABBA deadlocks.
*/
if (locked_rq && rq != locked_rq) {
- scx_kf_error("Invalid target CPU %d", cpu);
+ scx_error(sch, "Invalid target CPU %d", cpu);
return;
}
@@ -7420,13 +6418,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return NULL;
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ if (!sch->warned_deprecated_rq) {
+ printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
+ "use scx_bpf_locked_rq() when holding rq lock "
+ "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
+ sch->warned_deprecated_rq = true;
+ }
+
return cpu_rq(cpu);
}
/**
+ * scx_bpf_locked_rq - Return the rq currently locked by SCX
+ *
+ * Returns the rq if a rq lock is currently held by SCX.
+ * Otherwise emits an error and returns NULL.
+ */
+__bpf_kfunc struct rq *scx_bpf_locked_rq(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(preempt)();
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ rq = scx_locked_rq();
+ if (!rq) {
+ scx_error(sch, "accessing rq without holding rq lock");
+ return NULL;
+ }
+
+ return rq;
+}
+
+/**
+ * scx_bpf_cpu_curr - Return remote CPU's curr task
+ * @cpu: CPU of interest
+ *
+ * Callers must hold RCU read lock (KF_RCU).
+ */
+__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ return rcu_dereference(cpu_rq(cpu)->curr);
+}
+
+/**
* scx_bpf_task_cgroup - Return the sched cgroup of a task
* @p: task of interest
*
@@ -7442,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
{
struct task_group *tg = p->sched_task_group;
struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out;
- if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+ if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
goto out;
cgrp = tg_cgrp(tg);
@@ -7524,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
/* Aggregate per-CPU event counters into @events. */
memset(events, 0, sizeof(*events));
for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7590,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
+BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 292bb41a242e..43429b33e52c 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,29 +8,6 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
-static inline bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
-static inline bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
-}
-
-DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-
-DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
-
-/*
- * Return the rq currently locked from an scx callback, or NULL if no rq is
- * locked.
- */
-static inline struct rq *scx_locked_rq(void)
-{
- return __this_cpu_read(scx_locked_rq_state);
-}
-
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -100,7 +77,6 @@ int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
void scx_cgroup_move_task(struct task_struct *p);
-void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
@@ -111,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
static inline void scx_cgroup_move_task(struct task_struct *p) {}
-static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 7174e1c1a392..d2434c954848 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -819,10 +819,10 @@ void scx_idle_disable(void)
* Helpers that can be called from the BPF scheduler.
*/
-static int validate_node(int node)
+static int validate_node(struct scx_sched *sch, int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is disabled");
+ scx_error(sch, "per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -832,13 +832,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_kf_error("invalid node %d", node);
+ scx_error(sch, "invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_kf_error("unavailable node %d", node);
+ scx_error(sch, "unavailable node %d", node);
return -EINVAL;
}
@@ -847,26 +847,53 @@ static int validate_node(int node)
__bpf_kfunc_start_defs();
-static bool check_builtin_idle_enabled(void)
+static bool check_builtin_idle_enabled(struct scx_sched *sch)
{
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_kf_error("built-in idle tracking is disabled");
+ scx_error(sch, "built-in idle tracking is disabled");
return false;
}
-static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+/*
+ * Determine whether @p is a migration-disabled task in the context of BPF
+ * code.
+ *
+ * We can't simply check whether @p->migration_disabled is set in a
+ * sched_ext callback, because migration is always disabled for the current
+ * task while running BPF code.
+ *
+ * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively
+ * disable and re-enable migration. For this reason, the current task
+ * inside a sched_ext callback is always a migration-disabled task.
+ *
+ * Therefore, when @p->migration_disabled == 1, check whether @p is the
+ * current task or not: if it is, then migration was not disabled before
+ * entering the callback, otherwise migration was disabled.
+ *
+ * Returns true if @p is migration-disabled, false otherwise.
+ */
+static bool is_bpf_migration_disabled(const struct task_struct *p)
+{
+ if (p->migration_disabled == 1)
+ return p != current;
+ else
+ return p->migration_disabled;
+}
+
+static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags,
const struct cpumask *allowed, u64 flags)
{
struct rq *rq;
struct rq_flags rf;
s32 cpu;
- if (!kf_cpu_valid(prev_cpu, NULL))
+ if (!ops_cpu_valid(sch, prev_cpu, NULL))
return -EINVAL;
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
/*
@@ -879,7 +906,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
if (scx_kf_allowed_if_unlocked()) {
rq = task_rq_lock(p, &rf);
} else {
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
return -EPERM;
rq = scx_locked_rq();
}
@@ -898,7 +925,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
* selection optimizations and simply check whether the previously
* used CPU is idle and within the allowed cpumask.
*/
- if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) {
+ if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) {
if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
scx_idle_test_and_clear_cpu(prev_cpu))
cpu = prev_cpu;
@@ -922,9 +949,13 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
- return NUMA_NO_NODE;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
+ return NUMA_NO_NODE;
return cpu_to_node(cpu);
}
@@ -946,15 +977,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
+ struct scx_sched *sch;
s32 cpu;
- cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
*is_idle = false;
-
return prev_cpu;
}
@@ -981,7 +1018,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags)
{
- return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
+ cpus_allowed, flags);
}
/**
@@ -995,7 +1041,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1011,12 +1065,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
return idle_cpumask(NUMA_NO_NODE)->cpu;
@@ -1034,7 +1096,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1054,12 +1124,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
if (sched_smt_active())
@@ -1095,10 +1173,18 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
*/
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
- if (!check_builtin_idle_enabled())
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return false;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!check_builtin_idle_enabled(sch))
+ return false;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
return false;
return scx_idle_test_and_clear_cpu(cpu);
@@ -1126,7 +1212,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1158,12 +1252,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
@@ -1193,9 +1295,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
- node = validate_node(node);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1233,10 +1342,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
new file mode 100644
index 000000000000..b3617abed510
--- /dev/null
+++ b/kernel/sched/ext_internal.h
@@ -0,0 +1,1078 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_TASK_ITER_BATCH = 32,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
+ *
+ * Bits: [63 .. 48 47 .. 32 31 .. 0]
+ * [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
+
+enum scx_exit_flags {
+ /*
+ * ops.exit() may be called even if the loading failed before ops.init()
+ * finishes successfully. This is because ops.exit() allows rich exit
+ * info communication. The following flag indicates whether ops.init()
+ * finished successfully.
+ */
+ SCX_EFLAG_INITIALIZED,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* %SCX_EFLAG_* */
+ u64 flags;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+
+ /* debug dump */
+ char *dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+ * By default, if there are no other task to run on the CPU, ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
+
+ /*
+ * CPU cgroup support flags
+ */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /* the cgroup the task is joining */
+ struct cgroup *cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+ /* the weight of the cgroup [1..10000] */
+ u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
+};
+
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
+ /* next task is being scheduled by &sched_class_dl */
+ SCX_CPU_PREEMPT_DL,
+ /* next task is being scheduled by &sched_class_stop */
+ SCX_CPU_PREEMPT_STOP,
+ /* unknown reason for SCX being preempted */
+ SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+ /* the reason the CPU was preempted */
+ enum scx_cpu_preempt_reason reason;
+
+ /* the task that's going to be scheduled on the CPU */
+ struct task_struct *task;
+};
+
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
+ */
+struct sched_ext_ops {
+ /**
+ * @select_cpu: Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * Decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * @enqueue: Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
+ *
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @dequeue: Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
+ *
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * @tick: Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * @runnable: A task is becoming runnable on its associated CPU
+ * @p: task becoming runnable
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * This and the following three functions can be used to track a task's
+ * execution state transitions. A task becomes ->runnable() on a CPU,
+ * and then goes through one or more ->running() and ->stopping() pairs
+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+ * done running on the CPU.
+ *
+ * @p is becoming runnable on the CPU because it's
+ *
+ * - waking up (%SCX_ENQ_WAKEUP)
+ * - being moved from another CPU
+ * - being restored after temporarily taken off the queue for an
+ * attribute change.
+ *
+ * This and ->enqueue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be followed by ->enqueue()
+ * e.g. when @p is being dispatched to a remote CPU, or when @p is
+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+ * task may be ->enqueue()'d without being preceded by this operation
+ * e.g. after exhausting its slice.
+ */
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @running: A task is starting to run on its associated CPU
+ * @p: task starting to run
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+ * property is changed (i.e., affinity), since scx_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ */
+ void (*running)(struct task_struct *p);
+
+ /**
+ * @stopping: A task is stopping execution
+ * @p: task stopping to run
+ * @runnable: is task @p still runnable?
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+ * property is changed (i.e., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
+ * See ->runnable() for explanation on the task state notifiers. If
+ * !@runnable, ->quiescent() will be invoked after this operation
+ * returns.
+ */
+ void (*stopping)(struct task_struct *p, bool runnable);
+
+ /**
+ * @quiescent: A task is becoming not runnable on its associated CPU
+ * @p: task becoming not runnable
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ *
+ * @p is becoming quiescent on the CPU because it's
+ *
+ * - sleeping (%SCX_DEQ_SLEEP)
+ * - being moved to another CPU
+ * - being temporarily taken off the queue for an attribute change
+ * (%SCX_DEQ_SAVE)
+ *
+ * This and ->dequeue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be preceded by ->dequeue()
+ * e.g. when @p is being dispatched to a remote CPU.
+ */
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @yield: Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * @core_sched_before: Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Used by core-sched to determine the ordering between two tasks. See
+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+ * core-sched.
+ *
+ * Both @a and @b are runnable and may or may not currently be queued on
+ * the BPF scheduler. Should return %true if @a should run before @b.
+ * %false if there's no required ordering or @b should run before @a.
+ *
+ * If not specified, the default is ordering them according to when they
+ * became runnable.
+ */
+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+ /**
+ * @set_weight: Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * @set_cpumask: Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+ * This operation is called when @rq's CPU goes or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user also must implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+ /**
+ * @init_task: Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * @exit_task: Exit a previously-running task from the system
+ * @p: task to exit
+ * @args: exit arguments, see the struct definition
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * @enable: Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * @disable: Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /**
+ * @dump: Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /**
+ * @cgroup_init: Initialize a cgroup
+ * @cgrp: cgroup being initialized
+ * @args: init arguments, see the struct definition
+ *
+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
+ * @cgrp for sched_ext. This operation may block.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During cgroup
+ * creation, it will abort the specific cgroup creation.
+ */
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+
+ /**
+ * @cgroup_exit: Exit a cgroup
+ * @cgrp: cgroup being exited
+ *
+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+ * @cgrp for sched_ext. This operation my block.
+ */
+ void (*cgroup_exit)(struct cgroup *cgrp);
+
+ /**
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Prepare @p for move from cgroup @from to @to. This operation may
+ * block and can be used for allocations.
+ *
+ * Return 0 for success, -errno for failure. An error return aborts the
+ * migration.
+ */
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_move: Commit cgroup move
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Commit the move. @p is dequeued during this operation.
+ */
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_cancel_move: Cancel cgroup move
+ * @p: task whose cgroup move is being canceled
+ * @from: cgroup @p was being moved from
+ * @to: cgroup @p was being moved to
+ *
+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+ * Undo the preparation.
+ */
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_set_weight: A cgroup's weight is being changed
+ * @cgrp: cgroup whose weight is being updated
+ * @weight: new weight [1..10000]
+ *
+ * Update @cgrp's weight to @weight.
+ */
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+ * to. For example, if @period_us is 1_000_000 and @quota_us is
+ * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
+ * interpreted in the same fashion and specifies how much @cgrp can
+ * burst temporarily. The specific control mechanism and thus the
+ * interpretation of @period_us and burstiness is upto to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * @cpu_online: A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * @cpu_offline: A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
+ */
+
+ /**
+ * @init: Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * @exit: Clean up after the BPF scheduler
+ * @info: Exit info
+ *
+ * ops.exit() is also called on ops.init() failure, which is a bit
+ * unusual. This is to allow rich reporting through @info on how
+ * ops.init() failed.
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * @flags: %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
+ /**
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
+ * @name: BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
+};
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched_pcpu {
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats event_stats;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+ struct scx_sched_pcpu __percpu *pcpu;
+
+ bool warned_zero_slice:1;
+ bool warned_deprecated_rq:1;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
+ /*
+ * The task being enqueued was previously enqueued on the current CPU's
+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+ * invoked in a ->cpu_release() callback, and the task is again
+ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
+ * task will not be scheduled on the CPU until at least the next invocation
+ * of the ->cpu_acquire() callback.
+ */
+ SCX_ENQ_REENQ = 1LLU << 40,
+
+ /*
+ * The task being enqueued is the only task available for the cpu. By
+ * default, ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * The BPF scheduler is responsible for triggering a follow-up
+ * scheduling event. Otherwise, Execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+};
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The generic core-sched layer decided to execute the task even though
+ * it hasn't been dispatched yet. Dequeue from the BPF side.
+ */
+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
+};
+
+enum scx_kick_flags {
+ /*
+ * Kick the target CPU if idle. Guarantees that the target CPU goes
+ * through at least one full scheduling cycle before going idle. If the
+ * target CPU can be determined to be currently not idle and going to go
+ * through a scheduling cycle before going idle, noop.
+ */
+ SCX_KICK_IDLE = 1LLU << 0,
+
+ /*
+ * Preempt the current task and execute the dispatch path. If the
+ * current task of the target CPU is an SCX task, its ->scx.slice is
+ * cleared to zero before the scheduling path is invoked so that the
+ * task expires and the dispatch path is invoked.
+ */
+ SCX_KICK_PREEMPT = 1LLU << 1,
+
+ /*
+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+ * return after the target CPU finishes picking the next task.
+ */
+ SCX_KICK_WAIT = 1LLU << 2,
+};
+
+enum scx_tg_flags {
+ SCX_TG_ONLINE = 1U << 0,
+ SCX_TG_INITED = 1U << 1,
+};
+
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
+};
+
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
+
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b173a059315c..3a89f949e307 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3542,7 +3542,7 @@ out:
}
}
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+void init_numa_balancing(u64 clone_flags, struct task_struct *p)
{
int mm_users = 0;
struct mm_struct *mm = p->mm;
@@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se)
if (!gcfs_rq || !gcfs_rq->load.weight)
return;
- if (throttled_hierarchy(gcfs_rq))
- return;
-
shares = calc_group_shares(gcfs_rq);
if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
@@ -5291,18 +5288,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (cfs_rq->pelt_clock_throttled) {
struct rq *rq = rq_of(cfs_rq);
- if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
- cfs_rq->throttled_clock = rq_clock(rq);
- if (!cfs_rq->throttled_clock_self)
- cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
}
+#endif
}
}
@@ -5341,8 +5336,6 @@ static void set_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable--;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5363,8 +5356,6 @@ static void clear_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable++;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5392,7 +5383,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* DELAY_DEQUEUE relies on spurious wakeups, special task
* states must not suffer spurious wakeups, excempt them.
*/
- if (flags & DEQUEUE_SPECIAL)
+ if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
delay = false;
WARN_ON_ONCE(delay && se->sched_delayed);
@@ -5450,8 +5441,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+#endif
+ }
return true;
}
@@ -5725,74 +5726,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return cfs_bandwidth_used() && p->throttled;
+}
+
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
+static void throttle_cfs_rq_work(struct callback_head *work)
+{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ scoped_guard(task_rq_lock, p) {
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ /*
+ * If not in limbo, then either replenish has happened or this
+ * task got migrated out of the throttled cfs_rq, move along.
+ */
+ if (!cfs_rq->throttle_count)
+ return;
+ rq = scope.rq;
+ update_rq_clock(rq);
+ WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ /*
+ * Must not set throttled before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
+ p->throttled = true;
+ resched_curr(rq);
+ }
+}
+
+void init_cfs_throttle_work(struct task_struct *p)
+{
+ init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
+ /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+ INIT_LIST_HEAD(&p->throttle_node);
+}
+
/*
- * Ensure that neither of the group entities corresponding to src_cpu or
- * dest_cpu are members of a throttled hierarchy when performing group
- * load-balance operations.
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
*/
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+ WARN_ON_ONCE(p->se.on_rq);
+ list_del_init(&p->throttle_node);
+
+ /* task blocked after throttled */
+ if (flags & DEQUEUE_SLEEP) {
+ p->throttled = false;
+ return;
+ }
+
+ /*
+ * task is migrating off its old cfs_rq, detach
+ * the task's load from its old cfs_rq.
+ */
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
{
- struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
- src_cfs_rq = tg->cfs_rq[src_cpu];
- dest_cfs_rq = tg->cfs_rq[dest_cpu];
+ /* @p should have gone through dequeue_throttled_task() first */
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+ /*
+ * If the throttled task @p is enqueued to a throttled cfs_rq,
+ * take the fast path by directly putting the task on the
+ * target cfs_rq's limbo list.
+ *
+ * Do not do that when @p is current because the following race can
+ * cause @p's group_node to be incorectly re-insterted in its rq's
+ * cfs_tasks list, despite being throttled:
+ *
+ * cpuX cpuY
+ * p ret2user
+ * throttle_cfs_rq_work() sched_move_task(p)
+ * LOCK task_rq_lock
+ * dequeue_task_fair(p)
+ * UNLOCK task_rq_lock
+ * LOCK task_rq_lock
+ * task_current_donor(p) == true
+ * task_on_rq_queued(p) == true
+ * dequeue_task(p)
+ * put_prev_task(p)
+ * sched_change_group()
+ * enqueue_task(p) -> p's new cfs_rq
+ * is throttled, go
+ * fast path and skip
+ * actual enqueue
+ * set_next_task(p)
+ * list_move(&se->group_node, &rq->cfs_tasks); // bug
+ * schedule()
+ *
+ * In the above race case, @p current cfs_rq is in the same rq as
+ * its previous cfs_rq because sched_move_task() only moves a task
+ * to a different group from the same rq, so we can use its current
+ * cfs_rq to derive rq and test if the task is current.
+ */
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
- return throttled_hierarchy(src_cfs_rq) ||
- throttled_hierarchy(dest_cfs_rq);
+ /* we can't take the fast path, do an actual enqueue*/
+ p->throttled = false;
+ return false;
}
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
+
+ if (--cfs_rq->throttle_count)
+ return 0;
- cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
+ if (cfs_rq->pelt_clock_throttled) {
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ cfs_rq->throttled_clock_self = 0;
- cfs_rq->throttled_clock_self = 0;
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;
- if (WARN_ON_ONCE((s64)delta < 0))
- delta = 0;
+ cfs_rq->throttled_clock_self_time += delta;
+ }
- cfs_rq->throttled_clock_self_time += delta;
- }
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ p->throttled = false;
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ if (task_has_throttle_work(p))
+ return;
+
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
+static void record_throttle_clock(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttle_count++)
+ return 0;
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ /*
+ * For cfs_rqs that still have entities enqueued, PELT clock
+ * stop happens at dequeue time when all entities are dequeued.
+ */
+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
}
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
@@ -5800,8 +5980,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5824,76 +6003,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!dequeue)
return false; /* Throttle no longer required. */
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
WARN_ON_ONCE(cfs_rq->throttled_clock);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -5901,9 +6021,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+ /*
+ * It's possible we are called with !runtime_remaining due to things
+ * like user changed quota setting(see tg_set_cfs_bandwidth()) or async
+ * unthrottled us with a positive runtime_remaining but other still
+ * running entities consumed those runtime before we reached here.
+ *
+ * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+ * because any enqueue in tg_unthrottle_up() will immediately trigger a
+ * throttle, which is not supposed to happen on unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5933,62 +6064,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
- }
-
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
}
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -6472,6 +6549,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6639,19 +6717,28 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void task_throttle_setup_work(struct task_struct *p) {}
+static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
+static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
+
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return 0;
}
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
{
return 0;
}
@@ -6831,6 +6918,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (task_is_throttled(p) && enqueue_throttled_task(p))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6883,10 +6973,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -6908,10 +6994,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -6941,7 +7023,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -6963,6 +7044,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
bool was_sched_idle = sched_idle_rq(rq);
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
+ bool task_throttled = flags & DEQUEUE_THROTTLE;
struct task_struct *p = NULL;
int h_nr_idle = 0;
int h_nr_queued = 0;
@@ -6996,9 +7078,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -7010,7 +7091,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ if (task_sleep && se)
set_next_buddy(se);
break;
}
@@ -7037,9 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
}
sub_nr_running(rq, h_nr_queued);
@@ -7073,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_is_throttled(p)) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
+
if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
@@ -8660,7 +8745,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
- if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ if (task_is_throttled(p))
return;
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
@@ -8741,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
+ bool throttled;
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_queued)
return NULL;
+ throttled = false;
+
do {
/* Might not have done put_prev_entity() */
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ throttled |= check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
@@ -8761,7 +8849,10 @@ again:
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (unlikely(throttled))
+ task_throttle_setup_work(p);
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
@@ -8859,11 +8950,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
return pick_next_task_fair(rq, prev, NULL);
}
-static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
-{
- return !!dl_se->rq->cfs.nr_queued;
-}
-
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
{
return pick_task_fair(dl_se->rq);
@@ -8875,7 +8961,7 @@ void fair_server_init(struct rq *rq)
init_dl_entity(dl_se);
- dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
+ dl_server_init(dl_se, rq, fair_server_pick_task);
}
/*
@@ -8928,8 +9014,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- /* throttled hierarchies are not runnable */
- if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ /* !se->on_rq also covers throttled task */
+ if (!se->on_rq)
return false;
/* Tell the scheduler that we'd really like se to run next. */
@@ -9288,7 +9374,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) throttled_lb_pair, or
+ * 2) target cfs_rq is in throttled hierarchy, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
* 5) are cache-hot on their current CPU, or
@@ -9297,7 +9383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
return 0;
/*
@@ -13081,10 +13167,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- if (!throttled_hierarchy(cfs_rq))
+ /*
+ * If a task gets attached to this cfs_rq and before being queued,
+ * it gets migrated to another CPU due to reasons like affinity
+ * change, make sure this cfs_rq stays on leaf cfs_rq list to have
+ * that removed load decayed or it can cause faireness problem.
+ */
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
@@ -13095,10 +13184,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
update_load_avg(cfs_rq, se, UPDATE_TG);
- if (cfs_rq_throttled(cfs_rq))
- break;
-
- if (!throttled_hierarchy(cfs_rq))
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f..f921302dc40f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
u64 throttled;
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 000000000000..a23747bbe25b
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/types.h>
+#include "sched.h"
+
+int main(void)
+{
+ DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+ return 0;
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be9745d104f7..1f5d07067f60 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -365,25 +365,50 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
*
* dl_se::rq -- runqueue we belong to.
*
- * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
- * server when it runs out of tasks to run.
- *
* dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
* returns NULL.
*
* dl_server_update() -- called from update_curr_common(), propagates runtime
* to the server.
*
- * dl_server_start()
- * dl_server_stop() -- start/stop the server when it has (no) tasks.
+ * dl_server_start() -- start the server when it has tasks; it will stop
+ * automatically when there are no more tasks, per
+ * dl_se::server_pick() returning NULL.
+ *
+ * dl_server_stop() -- (force) stop the server; use when updating
+ * parameters.
*
* dl_server_init() -- initializes the server.
+ *
+ * When started the dl_server will (per dl_defer) schedule a timer for its
+ * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a
+ * server will run at the very end of its period.
+ *
+ * This is done such that any runtime from the target class can be accounted
+ * against the server -- through dl_server_update() above -- such that when it
+ * becomes time to run, it might already be out of runtime and get deferred
+ * until the next period. In this case dl_server_timer() will alternate
+ * between defer and replenish but never actually enqueue the server.
+ *
+ * Only when the target class does not manage to exhaust the server's runtime
+ * (there's actualy starvation in the given period), will the dl_server get on
+ * the runqueue. Once queued it will pick tasks from the target class and run
+ * them until either its runtime is exhaused, at which point its back to
+ * dl_server_timer, or until there are no more tasks to run, at which point
+ * the dl_server stops itself.
+ *
+ * By stopping at this point the dl_server retains bandwidth, which, if a new
+ * task wakes up imminently (starting the server again), can be used --
+ * subject to CBS wakeup rules -- without having to wait for the next period.
+ *
+ * Additionally, because of the dl_defer behaviour the start/stop behaviour is
+ * naturally thottled to once per period, avoiding high context switch
+ * workloads from spamming the hrtimer program/cancel paths.
*/
extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
- dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task);
extern void sched_init_dl_servers(void);
@@ -735,10 +760,12 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled;
+ bool throttled:1;
+ bool pelt_clock_throttled:1;
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1935,12 +1962,12 @@ extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
-extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+extern void init_numa_balancing(u64 clone_flags, struct task_struct *p);
#else /* !CONFIG_NUMA_BALANCING: */
static inline void
-init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+init_numa_balancing(u64 clone_flags, struct task_struct *p)
{
}
@@ -2342,6 +2369,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SPECIAL 0x10
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE 0x800
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2658,6 +2686,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_entity(struct sched_dl_entity *dl_se);
+extern void init_cfs_throttle_work(struct task_struct *p);
+
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 977e133bb8a4..444bdfdab731 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
enum numa_topology_type sched_numa_topology_type;
static int sched_domains_numa_levels;
-static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
@@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl,
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
+ sd_weight = cpumask_weight(tl->mask(tl, cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
@@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl,
};
sd_span = sched_domain_span(sd);
- cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
sd_id = cpumask_first(sd_span);
sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
@@ -1732,22 +1724,63 @@ sd_init(struct sched_domain_topology_level *tl,
return sd;
}
+#ifdef CONFIG_SCHED_SMT
+int cpu_smt_flags(void)
+{
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_smt_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+int cpu_cluster_flags(void)
+{
+ return SD_CLUSTER | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_clustergroup_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+int cpu_core_flags(void)
+{
+ return SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_coregroup_mask(cpu);
+}
+#endif
+
+const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_node_mask(cpu);
+}
+
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
+ SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+ SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
#endif
- SDTL_INIT(cpu_cpu_mask, NULL, PKG),
+ SDTL_INIT(tl_pkg_mask, NULL, PKG),
{ NULL, },
};
@@ -1768,10 +1801,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl)
}
#ifdef CONFIG_NUMA
+static int cpu_numa_flags(void)
+{
+ return SD_NUMA;
+}
-static const struct cpumask *sd_numa_mask(int cpu)
+static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
}
static void sched_numa_warn(const char *str)
@@ -2201,6 +2238,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
goto unlock;
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ if (!hop_masks)
+ goto unlock;
hop = hop_masks - k.masks;
ret = hop ?
@@ -2411,7 +2450,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
* breaks the linking done for an earlier span.
*/
for_each_cpu(cpu, cpu_map) {
- const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+ const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu);
int id;
/* lowest bit set in this mask is used as a unique id */
@@ -2419,7 +2458,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
if (cpumask_test_cpu(id, id_seen)) {
/* First CPU has already been seen, ensure identical spans */
- if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+ if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask))
return false;
} else {
/* First CPU hasn't been seen before, ensure it's a completely new span */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 41aa761c7738..25f62867a16d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -741,6 +741,26 @@ out:
}
#ifdef SECCOMP_ARCH_NATIVE
+static bool seccomp_uprobe_exception(struct seccomp_data *sd)
+{
+#if defined __NR_uretprobe || defined __NR_uprobe
+#ifdef SECCOMP_ARCH_COMPAT
+ if (sd->arch == SECCOMP_ARCH_NATIVE)
+#endif
+ {
+#ifdef __NR_uretprobe
+ if (sd->nr == __NR_uretprobe)
+ return true;
+#endif
+#ifdef __NR_uprobe
+ if (sd->nr == __NR_uprobe)
+ return true;
+#endif
+ }
+#endif
+ return false;
+}
+
/**
* seccomp_is_const_allow - check if filter is constant allow with given data
* @fprog: The BPF programs
@@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
return false;
/* Our single exception to filtering. */
-#ifdef __NR_uretprobe
-#ifdef SECCOMP_ARCH_COMPAT
- if (sd->arch == SECCOMP_ARCH_NATIVE)
-#endif
- if (sd->nr == __NR_uretprobe)
- return true;
-#endif
+ if (seccomp_uprobe_exception(sd))
+ return true;
for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
@@ -1043,6 +1058,9 @@ static const int mode1_syscalls[] = {
#ifdef __NR_uretprobe
__NR_uretprobe,
#endif
+#ifdef __NR_uprobe
+ __NR_uprobe,
+#endif
-1, /* negative terminated */
};
@@ -1139,7 +1157,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
static bool should_sleep_killable(struct seccomp_filter *match,
struct seccomp_knotif *n)
{
- return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+ return match->wait_killable_recv && n->state >= SECCOMP_NOTIFY_SENT;
}
static int seccomp_do_user_notification(int this_syscall,
@@ -1186,13 +1204,11 @@ static int seccomp_do_user_notification(int this_syscall,
if (err != 0) {
/*
- * Check to see if the notifcation got picked up and
- * whether we should switch to wait killable.
+ * Check to see whether we should switch to wait
+ * killable. Only return the interrupted error if not.
*/
- if (!wait_killable && should_sleep_killable(match, &n))
- continue;
-
- goto interrupted;
+ if (!(!wait_killable && should_sleep_killable(match, &n)))
+ goto interrupted;
}
addfd = list_first_entry_or_null(&n.addfd,
diff --git a/kernel/signal.c b/kernel/signal.c
index e2c928de7d2c..fe9190d84f28 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -4067,6 +4067,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
{
struct pid *pid;
enum pid_type type;
+ int ret;
/* Enforce flags be set to 0 until we add an extension. */
if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
@@ -4108,7 +4109,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
}
}
- return do_pidfd_send_signal(pid, sig, type, info, flags);
+ ret = do_pidfd_send_signal(pid, sig, type, info, flags);
+ put_pid(pid);
+
+ return ret;
}
static int
diff --git a/kernel/smp.c b/kernel/smp.c
index 56f83aa58ec8..02f52291fae4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -884,16 +884,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
- * (atomically) until function has completed on other CPUs. If
- * %SCF_RUN_LOCAL is set, the function will also be run locally
- * if the local CPU is set in the @cpumask.
- *
- * If @wait is true, then returns once @func has returned.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler. Preemption
* must be disabled when calling this function.
+ *
+ * @func is not called on the local CPU even if @mask contains it. Consider
+ * using on_each_cpu_cond_mask() instead if this is not desirable.
*/
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 513b1945987c..77198911b8dd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -165,7 +165,11 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
/* First entry of a task into a BH disabled section? */
if (!current->softirq_disable_cnt) {
if (preemptible()) {
- local_lock(&softirq_ctrl.lock);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK))
+ local_lock(&softirq_ctrl.lock);
+ else
+ migrate_disable();
+
/* Required to meet the RCU bottomhalf requirements. */
rcu_read_lock();
} else {
@@ -177,17 +181,34 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
* Track the per CPU softirq disabled state. On RT this is per CPU
* state to allow preemption of bottom half disabled sections.
*/
- newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
- /*
- * Reflect the result in the task state to prevent recursion on the
- * local lock and to make softirq_count() & al work.
- */
- current->softirq_disable_cnt = newcnt;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ newcnt = this_cpu_add_return(softirq_ctrl.cnt, cnt);
+ /*
+ * Reflect the result in the task state to prevent recursion on the
+ * local lock and to make softirq_count() & al work.
+ */
+ current->softirq_disable_cnt = newcnt;
- if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
- raw_local_irq_save(flags);
- lockdep_softirqs_off(ip);
- raw_local_irq_restore(flags);
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
+ } else {
+ bool sirq_dis = false;
+
+ if (!current->softirq_disable_cnt)
+ sirq_dis = true;
+
+ this_cpu_add(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt += cnt;
+ WARN_ON_ONCE(current->softirq_disable_cnt < 0);
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_dis) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
}
}
EXPORT_SYMBOL(__local_bh_disable_ip);
@@ -195,23 +216,42 @@ EXPORT_SYMBOL(__local_bh_disable_ip);
static void __local_bh_enable(unsigned int cnt, bool unlock)
{
unsigned long flags;
+ bool sirq_en = false;
int newcnt;
- DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
- this_cpu_read(softirq_ctrl.cnt));
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
+ this_cpu_read(softirq_ctrl.cnt));
+ if (softirq_count() == cnt)
+ sirq_en = true;
+ } else {
+ if (current->softirq_disable_cnt == cnt)
+ sirq_en = true;
+ }
- if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_en) {
raw_local_irq_save(flags);
lockdep_softirqs_on(_RET_IP_);
raw_local_irq_restore(flags);
}
- newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
- current->softirq_disable_cnt = newcnt;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ newcnt = this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt = newcnt;
- if (!newcnt && unlock) {
- rcu_read_unlock();
- local_unlock(&softirq_ctrl.lock);
+ if (!newcnt && unlock) {
+ rcu_read_unlock();
+ local_unlock(&softirq_ctrl.lock);
+ }
+ } else {
+ current->softirq_disable_cnt -= cnt;
+ this_cpu_sub(softirq_ctrl.cnt, cnt);
+ if (unlock && !current->softirq_disable_cnt) {
+ migrate_enable();
+ rcu_read_unlock();
+ } else {
+ WARN_ON_ONCE(current->softirq_disable_cnt < 0);
+ }
}
}
@@ -228,7 +268,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
lock_map_release(&bh_lock_map);
local_irq_save(flags);
- curcnt = __this_cpu_read(softirq_ctrl.cnt);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK))
+ curcnt = this_cpu_read(softirq_ctrl.cnt);
+ else
+ curcnt = current->softirq_disable_cnt;
/*
* If this is not reenabling soft interrupts, no point in trying to
@@ -805,6 +848,58 @@ static bool tasklet_clear_sched(struct tasklet_struct *t)
return false;
}
+#ifdef CONFIG_PREEMPT_RT
+struct tasklet_sync_callback {
+ spinlock_t cb_lock;
+ atomic_t cb_waiters;
+};
+
+static DEFINE_PER_CPU(struct tasklet_sync_callback, tasklet_sync_callback) = {
+ .cb_lock = __SPIN_LOCK_UNLOCKED(tasklet_sync_callback.cb_lock),
+ .cb_waiters = ATOMIC_INIT(0),
+};
+
+static void tasklet_lock_callback(void)
+{
+ spin_lock(this_cpu_ptr(&tasklet_sync_callback.cb_lock));
+}
+
+static void tasklet_unlock_callback(void)
+{
+ spin_unlock(this_cpu_ptr(&tasklet_sync_callback.cb_lock));
+}
+
+static void tasklet_callback_cancel_wait_running(void)
+{
+ struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback);
+
+ atomic_inc(&sync_cb->cb_waiters);
+ spin_lock(&sync_cb->cb_lock);
+ atomic_dec(&sync_cb->cb_waiters);
+ spin_unlock(&sync_cb->cb_lock);
+}
+
+static void tasklet_callback_sync_wait_running(void)
+{
+ struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback);
+
+ if (atomic_read(&sync_cb->cb_waiters)) {
+ spin_unlock(&sync_cb->cb_lock);
+ spin_lock(&sync_cb->cb_lock);
+ }
+}
+
+#else /* !CONFIG_PREEMPT_RT: */
+
+static void tasklet_lock_callback(void) { }
+static void tasklet_unlock_callback(void) { }
+static void tasklet_callback_sync_wait_running(void) { }
+
+#ifdef CONFIG_SMP
+static void tasklet_callback_cancel_wait_running(void) { }
+#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
static void tasklet_action_common(struct tasklet_head *tl_head,
unsigned int softirq_nr)
{
@@ -816,6 +911,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
tl_head->tail = &tl_head->head;
local_irq_enable();
+ tasklet_lock_callback();
while (list) {
struct tasklet_struct *t = list;
@@ -835,6 +931,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
}
}
tasklet_unlock(t);
+ tasklet_callback_sync_wait_running();
continue;
}
tasklet_unlock(t);
@@ -847,6 +944,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
__raise_softirq_irqoff(softirq_nr);
local_irq_enable();
}
+ tasklet_unlock_callback();
}
static __latent_entropy void tasklet_action(void)
@@ -897,12 +995,9 @@ void tasklet_unlock_spin_wait(struct tasklet_struct *t)
/*
* Prevent a live lock when current preempted soft
* interrupt processing or prevents ksoftirqd from
- * running. If the tasklet runs on a different CPU
- * then this has no effect other than doing the BH
- * disable/enable dance for nothing.
+ * running.
*/
- local_bh_disable();
- local_bh_enable();
+ tasklet_callback_cancel_wait_running();
} else {
cpu_relax();
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);
COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e6e9b85d4db5..f7d52d9543cc 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o
ifeq ($(CONFIG_SMP),y)
obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o
endif
-obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
+obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 577f0e6842d4..069d93bfb0c7 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -35,7 +35,7 @@
/**
* struct alarm_base - Alarm timer bases
- * @lock: Lock for syncrhonized access to the base
+ * @lock: Lock for synchronized access to the base
* @timerqueue: Timerqueue head managing the list of events
* @get_ktime: Function to read the time correlating to the base
* @get_timespec: Function to read the namespace time correlating to the base
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f3e831f62906..a59bc75ab7c5 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -633,7 +633,7 @@ void tick_offline_cpu(unsigned int cpu)
raw_spin_lock(&clockevents_lock);
tick_broadcast_offline(cpu);
- tick_shutdown(cpu);
+ tick_shutdown();
/*
* Unregister the clock event devices which were
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 0aef0e349e49..a1890a073196 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -144,7 +144,7 @@ static u64 suspend_start;
* Default for maximum permissible skew when cs->uncertainty_margin is
* not specified, and the lower bound even when cs->uncertainty_margin
* is specified. This is also the default that is used when registering
- * clocks with unspecifed cs->uncertainty_margin, so this macro is used
+ * clocks with unspecified cs->uncertainty_margin, so this macro is used
* even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
*/
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -407,9 +407,8 @@ void clocksource_verify_percpu(struct clocksource *cs)
if (!cpumask_empty(&cpus_behind))
pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
cpumask_pr_args(&cpus_behind), testcpu, cs->name);
- if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
- pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
- testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+ pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+ testcpu, cs_nsec_min, cs_nsec_max, cs->name);
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 30899a8cc52c..88aa062b8a55 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -59,6 +59,7 @@
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
static void retrigger_next_event(void *arg);
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
/*
* The timer bases:
@@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
},
{
.index = HRTIMER_BASE_MONOTONIC_SOFT,
.clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME_SOFT,
.clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME_SOFT,
.clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI_SOFT,
.clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
},
},
.csd = CSD_INIT(retrigger_next_event, NULL)
@@ -208,7 +201,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_
/*
* The offline local CPU can't be the default target if the
* next remote target event is after this timer. Keep the
- * elected new base. An IPI will we issued to reprogram
+ * elected new base. An IPI will be issued to reprogram
* it as a last resort.
*/
if (!hrtimer_base_is_online(this_cpu_base))
@@ -787,10 +780,10 @@ static void retrigger_next_event(void *arg)
* of the next expiring timer is enough. The return from the SMP
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
+ *
+ * In periodic low resolution mode, the next softirq expiration
+ * must also be updated.
*/
- if (!hrtimer_hres_active(base) && !tick_nohz_active)
- return;
-
raw_spin_lock(&base->lock);
hrtimer_update_base(base);
if (hrtimer_hres_active(base))
@@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
remove_hrtimer(timer, base, true, force_local);
if (mode & HRTIMER_MODE_REL)
- tim = ktime_add_safe(tim, base->get_time());
+ tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
tim = hrtimer_update_lowres(timer, tim, mode);
@@ -1574,10 +1567,10 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
switch (clock_id) {
- case CLOCK_REALTIME:
- return HRTIMER_BASE_REALTIME;
case CLOCK_MONOTONIC:
return HRTIMER_BASE_MONOTONIC;
+ case CLOCK_REALTIME:
+ return HRTIMER_BASE_REALTIME;
case CLOCK_BOOTTIME:
return HRTIMER_BASE_BOOTTIME;
case CLOCK_TAI:
@@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
}
}
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
+{
+ switch (clock_id) {
+ case CLOCK_MONOTONIC:
+ return ktime_get();
+ case CLOCK_REALTIME:
+ return ktime_get_real();
+ case CLOCK_BOOTTIME:
+ return ktime_get_boottime();
+ case CLOCK_TAI:
+ return ktime_get_clocktai();
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return ktime_get();
+ }
+}
+
+ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
+{
+ return __hrtimer_cb_get_time(timer->base->clockid);
+}
+EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
+
static void __hrtimer_setup(struct hrtimer *timer,
enum hrtimer_restart (*function)(struct hrtimer *),
clockid_t clock_id, enum hrtimer_mode mode)
@@ -2295,11 +2311,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
&new_base->clock_base[i]);
}
- /*
- * The migration might have changed the first expiring softirq
- * timer on this CPU. Update it.
- */
- __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
/* Tell the other CPU to retrigger the next event */
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 876d389b2e21..7c6110e964e7 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk)
struct hrtimer *tmr = &tsk->signal->real_timer;
if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) {
- hrtimer_forward(tmr, tmr->base->get_time(),
- tsk->signal->it_real_incr);
+ hrtimer_forward_now(tmr, tsk->signal->it_real_incr);
hrtimer_restart(tmr);
}
}
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 667452768ed3..5b6997f4dc3d 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
+#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
@@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- refcount_set(&ns->ns.count, 1);
-
ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free_page;
ns->ucounts = ucounts;
- ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
ns->offsets = old_ns->offsets;
ns->frozen_offsets = false;
+ ns_tree_add(ns);
return ns;
fail_free_page:
@@ -130,7 +129,7 @@ fail:
*
* Return: timens_for_children namespace or ERR_PTR.
*/
-struct time_namespace *copy_time_ns(unsigned long flags,
+struct time_namespace *copy_time_ns(u64 flags,
struct user_namespace *user_ns, struct time_namespace *old_ns)
{
if (!(flags & CLONE_NEWTIME))
@@ -253,16 +252,13 @@ out:
void free_time_ns(struct time_namespace *ns)
{
+ ns_tree_remove(ns);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
__free_page(ns->vvar_page);
- kfree(ns);
-}
-
-static struct time_namespace *to_time_ns(struct ns_common *ns)
-{
- return container_of(ns, struct time_namespace, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
@@ -466,7 +462,6 @@ out:
const struct proc_ns_operations timens_operations = {
.name = "time",
- .type = CLONE_NEWTIME,
.get = timens_get,
.put = timens_put,
.install = timens_install,
@@ -476,7 +471,6 @@ const struct proc_ns_operations timens_operations = {
const struct proc_ns_operations timens_for_children_operations = {
.name = "time_for_children",
.real_ns_name = "time",
- .type = CLONE_NEWTIME,
.get = timens_for_children_get,
.put = timens_put,
.install = timens_install,
@@ -484,9 +478,15 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.count = REFCOUNT_INIT(3),
+ .ns.ns_type = ns_common_type(&init_time_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(3),
.user_ns = &init_user_ns,
- .ns.inum = PROC_TIME_INIT_INO,
+ .ns.inum = ns_init_inum(&init_time_ns),
.ns.ops = &timens_operations,
.frozen_offsets = true,
};
+
+void __init time_ns_init(void)
+{
+ ns_tree_add(&init_time_ns);
+}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 8b582174b1f9..aa3120104a51 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -299,8 +299,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
- timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
- timr->it_interval);
+ timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
hrtimer_restart(timer);
}
@@ -535,7 +534,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
goto out;
}
/*
- * After succesful copy out, the timer ID is visible to user space
+ * After successful copy out, the timer ID is visible to user space
* now but not yet valid because new_timer::signal low order bit is 1.
*
* Complete the initialization with the clock specific create
@@ -825,7 +824,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode);
if (!absolute)
- expires = ktime_add_safe(expires, timer->base->get_time());
+ expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
hrtimer_set_expires(timer, expires);
if (!sigev_none)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index cc15fe293719..cc1afec306b3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
-void __init
-sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
+void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
@@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
pr_debug("Registered %pS as sched_clock source\n", read);
}
+EXPORT_SYMBOL_GPL(sched_clock_register);
void __init generic_sched_clock_init(void)
{
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 9a3859443c04..7e33d3f2e889 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -411,24 +411,18 @@ int tick_cpu_dying(unsigned int dying_cpu)
}
/*
- * Shutdown an event device on a given cpu:
+ * Shutdown an event device on the outgoing CPU:
*
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * Called by the dying CPU during teardown, with clockevents_lock held
+ * and interrupts disabled.
*/
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown(void)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
if (dev) {
- /*
- * Prevent that the clock events layer tries to call
- * the set mode function!
- */
- clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index faac36de35b9..4e4f7bbe2a64 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
extern void tick_offline_cpu(unsigned int cpu);
-extern void tick_shutdown(unsigned int cpu);
+extern void tick_shutdown(void);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 059fa8b79be6..b6974fce800c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -83,6 +83,12 @@ static inline bool tk_is_aux(const struct timekeeper *tk)
}
#endif
+static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
+{
+ tk->offs_aux = offs;
+ tk->monotonic_to_aux = ktime_to_timespec64(offs);
+}
+
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -1506,7 +1512,7 @@ static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespe
timekeeping_restore_shadow(tkd);
return -EINVAL;
}
- tks->offs_aux = offs;
+ tk_update_aux_offs(tks, offs);
}
timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
@@ -2937,7 +2943,7 @@ static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
* xtime ("realtime") is not applicable for auxiliary clocks and
* kept in sync with "monotonic".
*/
- aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow);
+ tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow));
timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
return 0;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b03d0ada6469..488e47e96e93 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
-
- SEQ_printf(m, " .get_time: %ps\n", base->get_time);
#ifdef CONFIG_HIGH_RES_TIMERS
SEQ_printf(m, " .offset: %Lu nsecs\n",
(unsigned long long) ktime_to_ns(base->offset));
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 8ba8b0d8a387..aa59919b8f2c 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -159,10 +159,10 @@ void vdso_time_update_aux(struct timekeeper *tk)
if (clock_mode != VDSO_CLOCKMODE_NONE) {
fill_clock_configuration(vc, &tk->tkr_mono);
- vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec;
nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
- nsec += tk->offs_aux;
+ nsec += tk->monotonic_to_aux.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
nsec = nsec << tk->tkr_mono.shift;
vdso_ts->nsec = nsec;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3ae52978cae6..8f23f5273bab 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -22,7 +22,6 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/key.h>
-#include <linux/verification.h>
#include <linux/namei.h>
#include <net/bpf_sk_storage.h>
@@ -1241,188 +1240,6 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-#ifdef CONFIG_KEYS
-__bpf_kfunc_start_defs();
-
-/**
- * bpf_lookup_user_key - lookup a key by its serial
- * @serial: key handle serial number
- * @flags: lookup-specific flags
- *
- * Search a key with a given *serial* and the provided *flags*.
- * If found, increment the reference count of the key by one, and
- * return it in the bpf_key structure.
- *
- * The bpf_key structure must be passed to bpf_key_put() when done
- * with it, so that the key reference count is decremented and the
- * bpf_key structure is freed.
- *
- * Permission checks are deferred to the time the key is used by
- * one of the available key-specific kfuncs.
- *
- * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
- * special keyring (e.g. session keyring), if it doesn't yet exist.
- * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
- * for the key construction, and to retrieve uninstantiated keys (keys
- * without data attached to them).
- *
- * Return: a bpf_key pointer with a valid key pointer if the key is found, a
- * NULL pointer otherwise.
- */
-__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
-{
- key_ref_t key_ref;
- struct bpf_key *bkey;
-
- if (flags & ~KEY_LOOKUP_ALL)
- return NULL;
-
- /*
- * Permission check is deferred until the key is used, as the
- * intent of the caller is unknown here.
- */
- key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
- if (IS_ERR(key_ref))
- return NULL;
-
- bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
- if (!bkey) {
- key_put(key_ref_to_ptr(key_ref));
- return NULL;
- }
-
- bkey->key = key_ref_to_ptr(key_ref);
- bkey->has_ref = true;
-
- return bkey;
-}
-
-/**
- * bpf_lookup_system_key - lookup a key by a system-defined ID
- * @id: key ID
- *
- * Obtain a bpf_key structure with a key pointer set to the passed key ID.
- * The key pointer is marked as invalid, to prevent bpf_key_put() from
- * attempting to decrement the key reference count on that pointer. The key
- * pointer set in such way is currently understood only by
- * verify_pkcs7_signature().
- *
- * Set *id* to one of the values defined in include/linux/verification.h:
- * 0 for the primary keyring (immutable keyring of system keys);
- * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
- * (where keys can be added only if they are vouched for by existing keys
- * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
- * keyring (primarily used by the integrity subsystem to verify a kexec'ed
- * kerned image and, possibly, the initramfs signature).
- *
- * Return: a bpf_key pointer with an invalid key pointer set from the
- * pre-determined ID on success, a NULL pointer otherwise
- */
-__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
-{
- struct bpf_key *bkey;
-
- if (system_keyring_id_check(id) < 0)
- return NULL;
-
- bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
- if (!bkey)
- return NULL;
-
- bkey->key = (struct key *)(unsigned long)id;
- bkey->has_ref = false;
-
- return bkey;
-}
-
-/**
- * bpf_key_put - decrement key reference count if key is valid and free bpf_key
- * @bkey: bpf_key structure
- *
- * Decrement the reference count of the key inside *bkey*, if the pointer
- * is valid, and free *bkey*.
- */
-__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
-{
- if (bkey->has_ref)
- key_put(bkey->key);
-
- kfree(bkey);
-}
-
-#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
-/**
- * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
- * @data_p: data to verify
- * @sig_p: signature of the data
- * @trusted_keyring: keyring with keys trusted for signature verification
- *
- * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
- * with keys in a keyring referenced by *trusted_keyring*.
- *
- * Return: 0 on success, a negative value on error.
- */
-__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
- struct bpf_dynptr *sig_p,
- struct bpf_key *trusted_keyring)
-{
- struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
- struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
- const void *data, *sig;
- u32 data_len, sig_len;
- int ret;
-
- if (trusted_keyring->has_ref) {
- /*
- * Do the permission check deferred in bpf_lookup_user_key().
- * See bpf_lookup_user_key() for more details.
- *
- * A call to key_task_permission() here would be redundant, as
- * it is already done by keyring_search() called by
- * find_asymmetric_key().
- */
- ret = key_validate(trusted_keyring->key);
- if (ret < 0)
- return ret;
- }
-
- data_len = __bpf_dynptr_size(data_ptr);
- data = __bpf_dynptr_data(data_ptr, data_len);
- sig_len = __bpf_dynptr_size(sig_ptr);
- sig = __bpf_dynptr_data(sig_ptr, sig_len);
-
- return verify_pkcs7_signature(data, data_len, sig, sig_len,
- trusted_keyring->key,
- VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
- NULL);
-}
-#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(key_sig_kfunc_set)
-BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
-#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
-BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
-#endif
-BTF_KFUNCS_END(key_sig_kfunc_set)
-
-static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
- .owner = THIS_MODULE,
- .set = &key_sig_kfunc_set,
-};
-
-static int __init bpf_key_sig_kfuncs_init(void)
-{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
- &bpf_key_sig_kfunc_set);
-}
-
-late_initcall(bpf_key_sig_kfuncs_init);
-#endif /* CONFIG_KEYS */
-
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1521,8 +1338,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
{
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
- if (type != BPF_READ)
- return false;
if (off % size != 0)
return false;
/*
@@ -1532,6 +1347,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
if (off + size > sizeof(struct pt_regs))
return false;
+ if (type == BPF_WRITE)
+ prog->aux->kprobe_write_ctx = true;
+
return true;
}
@@ -2728,20 +2546,25 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
struct pt_regs *regs;
int err;
+ /*
+ * graph tracer framework ensures we won't migrate, so there is no need
+ * to use migrate_disable for bpf_prog_run again. The check here just for
+ * __this_cpu_inc_return.
+ */
+ cant_sleep();
+
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
bpf_prog_inc_misses_counter(link->link.prog);
err = 1;
goto out;
}
- migrate_disable();
rcu_read_lock();
regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
- migrate_enable();
out:
__this_cpu_dec(bpf_prog_active);
@@ -2913,6 +2736,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!is_kprobe_multi(prog))
return -EINVAL;
+ /* Writing to context is not allowed for kprobes. */
+ if (prog->aux->kprobe_write_ctx)
+ return -EINVAL;
+
flags = attr->link_create.kprobe_multi.flags;
if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
return -EINVAL;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index f4d200f0c610..484ad7a18463 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -815,6 +815,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe
unsigned long bitmap;
unsigned long ret;
int offset;
+ int bit;
int i;
ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset);
@@ -829,6 +830,15 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe
if (fregs)
ftrace_regs_set_instruction_pointer(fregs, ret);
+ bit = ftrace_test_recursion_trylock(trace.func, ret);
+ /*
+ * This can fail because ftrace_test_recursion_trylock() allows one nest
+ * call. If we are already in a nested call, then we don't probe this and
+ * just return the original return address.
+ */
+ if (unlikely(bit < 0))
+ goto out;
+
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
trace.retval = ftrace_regs_get_return_value(fregs);
#endif
@@ -852,6 +862,8 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe
}
}
+ ftrace_test_recursion_unlock(bit);
+out:
/*
* The ftrace_graph_return() may still access the current
* ret_stack structure, we need to make sure the update of
@@ -1397,6 +1409,8 @@ error:
ftrace_graph_active--;
gops->saved_func = NULL;
fgraph_lru_release_index(i);
+ if (!ftrace_graph_active)
+ unregister_pm_notifier(&ftrace_suspend_notifier);
}
return ret;
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index c8034dfc1070..5a807d62e76d 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -428,8 +428,9 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad
{
unsigned long *addrs;
- if (alist->index >= alist->size)
- return -ENOMEM;
+ /* Previously we failed to expand the list. */
+ if (alist->index == alist->size)
+ return -ENOSPC;
alist->addrs[alist->index++] = addr;
if (alist->index < alist->size)
@@ -489,7 +490,7 @@ static int fprobe_module_callback(struct notifier_block *nb,
for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++)
fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist);
- if (alist.index < alist.size && alist.index > 0)
+ if (alist.index > 0)
ftrace_set_filter_ips(&fprobe_graph_ops.ops,
alist.addrs, alist.index, 1, 0);
mutex_unlock(&fprobe_mutex);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 00b76d450a89..a69067367c29 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4661,13 +4661,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
} else {
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
}
+ } else {
+ if (hash)
+ iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash);
+ else
+ iter->hash = EMPTY_HASH;
+ }
- if (!iter->hash) {
- trace_parser_put(&iter->parser);
- goto out_unlock;
- }
- } else
- iter->hash = hash;
+ if (!iter->hash) {
+ trace_parser_put(&iter->parser);
+ goto out_unlock;
+ }
ret = 0;
@@ -6543,9 +6547,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
- } else {
- /* For read only, the hash is the ops hash */
- iter->hash = NULL;
}
mutex_unlock(&iter->ops->func_hash->regex_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bb71a0dc9d69..43460949ad3f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7666,7 +7666,7 @@ static __init int test_ringbuffer(void)
rb_test_started = true;
set_current_state(TASK_INTERRUPTIBLE);
- /* Just run for 10 seconds */;
+ /* Just run for 10 seconds */
schedule_timeout(10 * HZ);
kthread_stop(rb_hammer);
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index eea447b06907..c1347da69e9d 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
mon = ltl_get_monitor(current);
switch (id) {
+#ifdef __NR_clock_nanosleep
case __NR_clock_nanosleep:
+#endif
#ifdef __NR_clock_nanosleep_time64
case __NR_clock_nanosleep_time64:
#endif
@@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
break;
+#ifdef __NR_futex
case __NR_futex:
+#endif
#ifdef __NR_futex_time64
case __NR_futex_time64:
#endif
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 1482e91c39f4..48338520376f 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
*/
static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct rv_monitor *mon = p;
+ struct rv_monitor *mon = container_of(p, struct rv_monitor, list);
(*pos)++;
@@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
retval = create_monitor_dir(monitor, parent);
if (retval)
- return retval;
+ goto out_unlock;
/* keep children close to the parent for easier visualisation */
if (parent)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4283ed4e8f59..b3c94fbaf002 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -834,7 +834,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
/* copy the current bits to the new max */
ret = trace_pid_list_first(filtered_pids, &pid);
while (!ret) {
- trace_pid_list_set(pid_list, pid);
+ ret = trace_pid_list_set(pid_list, pid);
+ if (ret < 0)
+ goto out;
+
ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
nr_pids++;
}
@@ -871,6 +874,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
trace_parser_clear(&parser);
ret = 0;
}
+ out:
trace_parser_put(&parser);
if (ret < 0) {
@@ -1816,7 +1820,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
@@ -1830,7 +1834,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
}
@@ -1848,12 +1852,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && !isspace(ch) && ch) {
if (parser->idx < parser->size - 1)
parser->buffer[parser->idx++] = ch;
- else
- return -EINVAL;
+ else {
+ ret = -EINVAL;
+ goto fail;
+ }
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
}
@@ -1868,11 +1874,15 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
/* Make sure the parsed string always terminates with '\0'. */
parser->buffer[parser->idx] = 0;
} else {
- return -EINVAL;
+ ret = -EINVAL;
+ goto fail;
}
*ppos += read;
return read;
+fail:
+ trace_parser_fail(parser);
+ return ret;
}
/* TODO add a seq_buf_to_buffer() */
@@ -7203,7 +7213,7 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
entry = ring_buffer_event_data(event);
entry->ip = ip;
- len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
+ len = copy_from_user_nofault(&entry->buf, ubuf, cnt);
if (len) {
memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
cnt = FAULTED_SIZE;
@@ -7300,7 +7310,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
entry = ring_buffer_event_data(event);
- len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
+ len = copy_from_user_nofault(&entry->id, ubuf, cnt);
if (len) {
entry->id = -1;
memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
@@ -10632,10 +10642,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
ret = print_trace_line(&iter);
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(&iter);
+
+ trace_printk_seq(&iter.seq);
}
touch_nmi_watchdog();
-
- trace_printk_seq(&iter.seq);
}
if (!cnt)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1dbf1d3cf2f1..5f4bed5842f9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1292,6 +1292,7 @@ bool ftrace_event_is_function(struct trace_event_call *call);
*/
struct trace_parser {
bool cont;
+ bool fail;
char *buffer;
unsigned idx;
unsigned size;
@@ -1299,7 +1300,7 @@ struct trace_parser {
static inline bool trace_parser_loaded(struct trace_parser *parser)
{
- return (parser->idx != 0);
+ return !parser->fail && parser->idx != 0;
}
static inline bool trace_parser_cont(struct trace_parser *parser)
@@ -1313,6 +1314,11 @@ static inline void trace_parser_clear(struct trace_parser *parser)
parser->idx = 0;
}
+static inline void trace_parser_fail(struct trace_parser *parser)
+{
+ parser->fail = true;
+}
+
extern int trace_parser_get_init(struct trace_parser *parser, int size);
extern void trace_parser_put(struct trace_parser *parser);
extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
@@ -2204,7 +2210,7 @@ static inline bool is_good_system_name(const char *name)
static inline void sanitize_event_name(char *name)
{
while (*name++ != '\0')
- if (*name == ':' || *name == '.')
+ if (*name == ':' || *name == '.' || *name == '*')
*name = '_';
}
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 5d64a18cacac..d06854bd32b3 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -230,6 +230,10 @@ static int dyn_event_open(struct inode *inode, struct file *file)
{
int ret;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
ret = tracing_check_open_get_tr(NULL);
if (ret)
return ret;
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index af42aaa3d172..2ab283fd3032 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
{
struct user_event_enabler_fault *fault;
- fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+ fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT);
if (!fault)
return false;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 66e1a527cf1a..a7f4b9a47a71 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -27,14 +27,21 @@ struct fgraph_cpu_data {
unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
};
+struct fgraph_ent_args {
+ struct ftrace_graph_ent_entry ent;
+ /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */
+ unsigned long args[FTRACE_REGS_MAX_ARGS];
+};
+
struct fgraph_data {
struct fgraph_cpu_data __percpu *cpu_data;
/* Place to preserve last processed entry. */
union {
- struct ftrace_graph_ent_entry ent;
+ struct fgraph_ent_args ent;
+ /* TODO allow retaddr to have args */
struct fgraph_retaddr_ent_entry rent;
- } ent;
+ };
struct ftrace_graph_ret_entry ret;
int failed;
int cpu;
@@ -627,10 +634,13 @@ get_return_for_leaf(struct trace_iterator *iter,
* Save current and next entries for later reference
* if the output fails.
*/
- if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
- data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
- else
- data->ent.ent = *curr;
+ if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) {
+ data->rent = *(struct fgraph_retaddr_ent_entry *)curr;
+ } else {
+ int size = min((int)sizeof(data->ent), (int)iter->ent_size);
+
+ memcpy(&data->ent, curr, size);
+ }
/*
* If the next event is not a return type, then
* we only care about what type it is. Otherwise we can
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ccae62d4fb91..fa60362a3f31 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
return -EINVAL;
}
buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index fd259da0aa64..dc734867f0fc 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2322,12 +2322,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
int running, err;
char *buf __free(kfree) = NULL;
- buf = kmalloc(count, GFP_KERNEL);
+ if (count < 1)
+ return 0;
+
+ buf = kmalloc(count + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
if (copy_from_user(buf, ubuf, count))
return -EFAULT;
+ buf[count] = '\0';
if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL))
return -ENOMEM;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 16b283f9d831..6ea2f6363b90 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -57,12 +57,11 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_sched = tsk->policy;
stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns);
+ stats->ac_ppid = task_ppid_nr_ns(tsk, pid_ns);
rcu_read_lock();
tcred = __task_cred(tsk);
stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
- stats->ac_ppid = pid_alive(tsk) ?
- task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
diff --git a/kernel/user.c b/kernel/user.c
index f46b1d41163b..0163665914c9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -65,10 +65,11 @@ struct user_namespace init_user_ns = {
.nr_extents = 1,
},
},
- .ns.count = REFCOUNT_INIT(3),
+ .ns.ns_type = ns_common_type(&init_user_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .ns.inum = PROC_USER_INIT_INO,
+ .ns.inum = ns_init_inum(&init_user_ns),
#ifdef CONFIG_USER_NS
.ns.ops = &userns_operations,
#endif
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 682f40d5632d..03cb63883d04 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/nstree.h>
static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
@@ -124,12 +125,11 @@ int create_user_ns(struct cred *new)
goto fail_dec;
ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
- ret = ns_alloc_inum(&ns->ns);
+
+ ret = ns_common_init(ns);
if (ret)
goto fail_free;
- ns->ns.ops = &userns_operations;
- refcount_set(&ns->ns.count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->level = parent_ns->level + 1;
@@ -159,12 +159,13 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
set_cred_user_ns(new, ns);
+ ns_tree_add(ns);
return 0;
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
fail_free:
kmem_cache_free(user_ns_cachep, ns);
fail_dec:
@@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work)
do {
struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ ns_tree_remove(ns);
if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
kfree(ns->gid_map.forward);
kfree(ns->gid_map.reverse);
@@ -218,11 +220,12 @@ static void free_user_ns(struct work_struct *work)
#endif
retire_userns_sysctls(ns);
key_free_user_ns(ns);
- ns_free_inum(&ns->ns);
- kmem_cache_free(user_ns_cachep, ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
dec_user_namespaces(ucounts);
ns = parent;
- } while (refcount_dec_and_test(&parent->ns.count));
+ } while (ns_ref_put(parent));
}
void __put_user_ns(struct user_namespace *ns)
@@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns)
}
EXPORT_SYMBOL(current_in_userns);
-static inline struct user_namespace *to_user_ns(struct ns_common *ns)
-{
- return container_of(ns, struct user_namespace, ns);
-}
-
static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -1402,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns)
const struct proc_ns_operations userns_operations = {
.name = "user",
- .type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
@@ -1413,6 +1410,7 @@ const struct proc_ns_operations userns_operations = {
static __init int user_namespaces_init(void)
{
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
+ ns_tree_add(&init_user_ns);
return 0;
}
subsys_initcall(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index b1ac3ca870f2..ebbfc578a9d3 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -13,6 +13,7 @@
#include <linux/cred.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/nstree.h>
#include <linux/sched/task.h>
static struct kmem_cache *uts_ns_cache __ro_after_init;
@@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
}
-static struct uts_namespace *create_uts_ns(void)
-{
- struct uts_namespace *uts_ns;
-
- uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
- if (uts_ns)
- refcount_set(&uts_ns->ns.count, 1);
- return uts_ns;
-}
-
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
@@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = create_uts_ns();
+ ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL);
if (!ns)
goto fail_dec;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free;
ns->ucounts = ucounts;
- ns->ns.ops = &utsns_operations;
-
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
+ ns_tree_add(ns);
return ns;
fail_free:
@@ -86,7 +76,7 @@ fail:
* utsname of this process won't be seen by parent, and vice
* versa.
*/
-struct uts_namespace *copy_utsname(unsigned long flags,
+struct uts_namespace *copy_utsname(u64 flags,
struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
struct uts_namespace *new_ns;
@@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags,
void free_uts_ns(struct uts_namespace *ns)
{
+ ns_tree_remove(ns);
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kmem_cache_free(uts_ns_cache, ns);
-}
-
-static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
-{
- return container_of(ns, struct uts_namespace, ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *utsns_get(struct task_struct *task)
@@ -159,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns)
const struct proc_ns_operations utsns_operations = {
.name = "uts",
- .type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
@@ -174,4 +160,5 @@ void __init uts_ns_init(void)
offsetof(struct uts_namespace, name),
sizeof_field(struct uts_namespace, name),
NULL);
+ ns_tree_add(&init_uts_ns);
}
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index bc738fa90c1d..27107dcc1cbf 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -100,6 +100,7 @@ void vhost_task_stop(struct vhost_task *vtsk)
* freeing it below.
*/
wait_for_completion(&vtsk->exited);
+ put_task_struct(vtsk->task);
kfree(vtsk);
}
EXPORT_SYMBOL_GPL(vhost_task_stop);
@@ -148,7 +149,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
return ERR_CAST(tsk);
}
- vtsk->task = tsk;
+ vtsk->task = get_task_struct(tsk);
return vtsk;
}
EXPORT_SYMBOL_GPL(vhost_task_create);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6b79b3675c3..45320e27a16c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -222,7 +222,9 @@ struct worker_pool {
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
-
+#ifdef CONFIG_PREEMPT_RT
+ spinlock_t cb_lock; /* BH worker cancel lock */
+#endif
/*
* Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
@@ -2930,7 +2932,7 @@ static void idle_worker_timeout(struct timer_list *t)
raw_spin_unlock_irq(&pool->lock);
if (do_cull)
- queue_work(system_unbound_wq, &pool->idle_cull_work);
+ queue_work(system_dfl_wq, &pool->idle_cull_work);
}
/**
@@ -3078,6 +3080,31 @@ restart:
goto restart;
}
+#ifdef CONFIG_PREEMPT_RT
+static void worker_lock_callback(struct worker_pool *pool)
+{
+ spin_lock(&pool->cb_lock);
+}
+
+static void worker_unlock_callback(struct worker_pool *pool)
+{
+ spin_unlock(&pool->cb_lock);
+}
+
+static void workqueue_callback_cancel_wait_running(struct worker_pool *pool)
+{
+ spin_lock(&pool->cb_lock);
+ spin_unlock(&pool->cb_lock);
+}
+
+#else
+
+static void worker_lock_callback(struct worker_pool *pool) { }
+static void worker_unlock_callback(struct worker_pool *pool) { }
+static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { }
+
+#endif
+
/**
* manage_workers - manage worker pool
* @worker: self
@@ -3557,6 +3584,7 @@ static void bh_worker(struct worker *worker)
int nr_restarts = BH_WORKER_RESTARTS;
unsigned long end = jiffies + BH_WORKER_JIFFIES;
+ worker_lock_callback(pool);
raw_spin_lock_irq(&pool->lock);
worker_leave_idle(worker);
@@ -3585,6 +3613,7 @@ done:
worker_enter_idle(worker);
kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
+ worker_unlock_callback(pool);
}
/*
@@ -4222,17 +4251,17 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
(data & WORK_OFFQ_BH)) {
/*
* On RT, prevent a live lock when %current preempted
- * soft interrupt processing or prevents ksoftirqd from
- * running by keeping flipping BH. If the BH work item
- * runs on a different CPU then this has no effect other
- * than doing the BH disable/enable dance for nothing.
- * This is copied from
- * kernel/softirq.c::tasklet_unlock_spin_wait().
+ * soft interrupt processing by blocking on lock which
+ * is owned by the thread invoking the callback.
*/
while (!try_wait_for_completion(&barr.done)) {
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- local_bh_disable();
- local_bh_enable();
+ struct worker_pool *pool;
+
+ guard(rcu)();
+ pool = get_work_pool(work);
+ if (pool)
+ workqueue_callback_cancel_wait_running(pool);
} else {
cpu_relax();
}
@@ -4782,6 +4811,9 @@ static int init_worker_pool(struct worker_pool *pool)
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
+#ifdef CONFIG_PREEMPT_RT
+ spin_lock_init(&pool->cb_lock);
+#endif
/* shouldn't fail above this point */
pool->attrs = alloc_workqueue_attrs();
@@ -6046,7 +6078,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
struct pool_workqueue *pwq;
bool ret;
- rcu_read_lock();
preempt_disable();
if (cpu == WORK_CPU_UNBOUND)
@@ -6056,7 +6087,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
ret = !list_empty(&pwq->inactive_works);
preempt_enable();
- rcu_read_unlock();
return ret;
}
@@ -7546,8 +7576,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
if (!thresh)
return;
- rcu_read_lock();
-
for_each_pool(pool, pi) {
unsigned long pool_ts, touched, ts;
@@ -7589,8 +7617,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
}
- rcu_read_unlock();
-
if (lockup_detected)
show_all_workqueues();
@@ -7642,7 +7668,7 @@ static int wq_watchdog_param_set_thresh(const char *val,
if (ret)
return ret;
- if (system_wq)
+ if (system_percpu_wq)
wq_watchdog_set_thresh(thresh);
else
wq_watchdog_thresh = thresh;
@@ -7802,22 +7828,22 @@ void __init workqueue_init_early(void)
ordered_wq_attrs[i] = attrs;
}
- system_wq = alloc_workqueue("events", 0, 0);
- system_percpu_wq = alloc_workqueue("events", 0, 0);
- system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
- system_long_wq = alloc_workqueue("events_long", 0, 0);
+ system_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_highpri_wq = alloc_workqueue("events_highpri",
+ WQ_HIGHPRI | WQ_PERCPU, 0);
+ system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
- WQ_FREEZABLE, 0);
+ WQ_FREEZABLE | WQ_PERCPU, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
- WQ_POWER_EFFICIENT, 0);
+ WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
- WQ_FREEZABLE | WQ_POWER_EFFICIENT,
- 0);
- system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
+ system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
- WQ_BH | WQ_HIGHPRI, 0);
+ WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0);
BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||