Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt | 13
-rw-r--r--  kernel/audit.c | 278
-rw-r--r--  kernel/audit.h | 13
-rw-r--r--  kernel/audit_tree.c | 6
-rw-r--r--  kernel/auditfilter.c | 2
-rw-r--r--  kernel/auditsc.c | 63
-rw-r--r--  kernel/bpf/stackmap.c | 4
-rw-r--r--  kernel/bpf/verifier.c | 1
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 11
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 19
-rw-r--r--  kernel/cgroup/cgroup.c | 199
-rw-r--r--  kernel/cgroup/cpuset-internal.h | 5
-rw-r--r--  kernel/cgroup/cpuset-v1.c | 12
-rw-r--r--  kernel/cgroup/cpuset.c | 752
-rw-r--r--  kernel/cgroup/debug.c | 4
-rw-r--r--  kernel/cgroup/freezer.c | 16
-rw-r--r--  kernel/events/callchain.c | 40
-rw-r--r--  kernel/events/core.c | 369
-rw-r--r--  kernel/events/internal.h | 4
-rw-r--r--  kernel/events/ring_buffer.c | 2
-rw-r--r--  kernel/events/uprobes.c | 102
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/futex/syscalls.c | 106
-rw-r--r--  kernel/irq/Kconfig | 6
-rw-r--r--  kernel/irq/chip.c | 37
-rw-r--r--  kernel/irq/devres.c | 127
-rw-r--r--  kernel/irq/handle.c | 49
-rw-r--r--  kernel/irq/irq_test.c | 55
-rw-r--r--  kernel/irq/irqdesc.c | 7
-rw-r--r--  kernel/rseq.c | 10
-rw-r--r--  kernel/sched/build_policy.c | 1
-rw-r--r--  kernel/sched/core.c | 70
-rw-r--r--  kernel/sched/deadline.c | 73
-rw-r--r--  kernel/sched/ext.c | 1556
-rw-r--r--  kernel/sched/ext.h | 25
-rw-r--r--  kernel/sched/ext_idle.c | 146
-rw-r--r--  kernel/sched/ext_internal.h | 1078
-rw-r--r--  kernel/sched/fair.c | 489
-rw-r--r--  kernel/sched/pelt.h | 4
-rw-r--r--  kernel/sched/rq-offsets.c | 12
-rw-r--r--  kernel/sched/sched.h | 7
-rw-r--r--  kernel/sched/topology.c | 73
-rw-r--r--  kernel/seccomp.c | 32
-rw-r--r--  kernel/smp.c | 11
-rw-r--r--  kernel/softirq.c | 145
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 7
-rw-r--r--  kernel/time/hrtimer.c | 40
-rw-r--r--  kernel/time/itimer.c | 3
-rw-r--r--  kernel/time/posix-timers.c | 7
-rw-r--r--  kernel/time/sched_clock.c | 4
-rw-r--r--  kernel/time/tick-common.c | 16
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/workqueue.c | 80
58 files changed, 3628 insertions, 2579 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 54ea59ff8fbe..da326800c1c9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -103,6 +103,19 @@ config PREEMPT_RT
Select this if you are building a kernel for systems which
require real-time guarantees.
+config PREEMPT_RT_NEEDS_BH_LOCK
+ bool "Enforce softirq synchronisation on PREEMPT_RT"
+ depends on PREEMPT_RT
+ help
+ Enforce synchronisation across the softirq context. On PREEMPT_RT
+ the softirq is preemptible. This enforces the same per-CPU BKL
+ semantics that non-PREEMPT_RT builds have. It should not be needed
+ because per-CPU locks were added to avoid the per-CPU BKL.
+
+ This switch provides the old behaviour for testing reasons. Select
+ this if you suspect an error with preemptible softirqs and want to
+ test the old synchronised behaviour.
+
config PREEMPT_COUNT
bool
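
For context (not part of the patch), a minimal sketch of the per-CPU serialisation assumption that this option restores; the demo_counter variable and demo_update() helper are hypothetical:

#include <linux/bottom_half.h>
#include <linux/percpu.h>

/*
 * Sketch only, not taken from the patch. On non-PREEMPT_RT,
 * local_bh_disable() keeps softirqs from running on this CPU, so the
 * plain per-CPU increment needs no further locking. On PREEMPT_RT,
 * softirqs run in preemptible threads, so code like this is expected to
 * use an explicit per-CPU lock instead; the option above restores the
 * old serialisation to help confirm suspected races of this kind.
 */
static DEFINE_PER_CPU(unsigned long, demo_counter);

static void demo_update(void)
{
	local_bh_disable();
	__this_cpu_inc(demo_counter);
	local_bh_enable();
}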
diff --git a/kernel/audit.c b/kernel/audit.c
index 61b5744d0bb6..26a332ffb1b8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -54,6 +54,7 @@
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/security.h>
+#include <linux/lsm_hooks.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -81,6 +82,13 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK;
/* private audit network namespace index */
static unsigned int audit_net_id;
+/* Counts and lists of the security modules that provide a
+ subject or object security context */
+static u32 audit_subj_secctx_cnt;
+static u32 audit_obj_secctx_cnt;
+static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT];
+static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT];
+
/**
* struct audit_net - audit private network namespace data
* @sk: communication socket
@@ -195,8 +203,10 @@ static struct audit_ctl_mutex {
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
- struct sk_buff *skb; /* formatted skb ready to send */
+ struct sk_buff *skb; /* the skb for audit_log functions */
+ struct sk_buff_head skb_list; /* formatted skbs, ready to send */
struct audit_context *ctx; /* NULL or associated context */
+ struct audit_stamp stamp; /* audit stamp for these records */
gfp_t gfp_mask;
};
@@ -279,6 +289,33 @@ static pid_t auditd_pid_vnr(void)
}
/**
+ * audit_cfg_lsm - Identify a security module as providing a secctx.
+ * @lsmid: LSM identity
+ * @flags: which contexts are provided
+ *
+ * Description:
+ * Increments the count of the security modules providing a secctx.
+ * If the LSM id is already in the list, leave it alone.
+ */
+void audit_cfg_lsm(const struct lsm_id *lsmid, int flags)
+{
+ int i;
+
+ if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) {
+ for (i = 0 ; i < audit_subj_secctx_cnt; i++)
+ if (audit_subj_lsms[i] == lsmid)
+ return;
+ audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid;
+ }
+ if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) {
+ for (i = 0 ; i < audit_obj_secctx_cnt; i++)
+ if (audit_obj_lsms[i] == lsmid)
+ return;
+ audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid;
+ }
+}
+
+/**
* audit_get_sk - Return the audit socket for the given network namespace
* @net: the destination network namespace
*
@@ -1113,7 +1150,6 @@ static int is_audit_feature_set(int i)
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
-
static int audit_get_feature(struct sk_buff *skb)
{
u32 seq;
@@ -1473,7 +1509,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
case AUDIT_SIGNAL_INFO:
if (lsmprop_is_set(&audit_sig_lsm)) {
err = security_lsmprop_to_secctx(&audit_sig_lsm,
- &lsmctx);
+ &lsmctx, LSM_ID_UNDEF);
if (err < 0)
return err;
}
@@ -1776,10 +1812,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
+ struct sk_buff *skb;
+
if (!ab)
return;
- kfree_skb(ab->skb);
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ kfree_skb(skb);
kmem_cache_free(audit_buffer_cache, ab);
}
@@ -1792,9 +1831,14 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
if (!ab)
return NULL;
+ skb_queue_head_init(&ab->skb_list);
+
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
+
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
goto err;
@@ -1833,11 +1877,11 @@ unsigned int audit_serial(void)
}
static inline void audit_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial)
+ struct audit_stamp *stamp)
{
- if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- ktime_get_coarse_real_ts64(t);
- *serial = audit_serial();
+ if (!ctx || !auditsc_get_stamp(ctx, stamp)) {
+ ktime_get_coarse_real_ts64(&stamp->ctime);
+ stamp->serial = audit_serial();
}
}
@@ -1860,8 +1904,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
struct audit_buffer *ab;
- struct timespec64 t;
- unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1916,12 +1958,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
}
- audit_get_stamp(ab->ctx, &t, &serial);
+ audit_get_stamp(ab->ctx, &ab->stamp);
/* cancel dummy context to enable supporting records */
if (ctx)
ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
- (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
return ab;
}
@@ -2177,33 +2221,179 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
-int audit_log_task_context(struct audit_buffer *ab)
+/**
+ * audit_buffer_aux_new - Add an aux record buffer to the skb list
+ * @ab: audit_buffer
+ * @type: message type
+ *
+ * Aux records are allocated and added to the skb list of
+ * the "main" record. The ab->skb is reset to point to the
+ * aux record on its creation. When the aux record in complete
+ * ab->skb has to be reset to point to the "main" record.
+ * This allows the audit_log_ functions to be ignorant of
+ * which kind of record it is logging to. It also avoids adding
+ * special data for aux records.
+ *
+ * On success ab->skb will point to the new aux record.
+ * Returns 0 on success, -ENOMEM should allocation fail.
+ */
+static int audit_buffer_aux_new(struct audit_buffer *ab, int type)
+{
+ WARN_ON(ab->skb != skb_peek(&ab->skb_list));
+
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask);
+ if (!ab->skb)
+ goto err;
+ if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+ goto err;
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
+ audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
+
+ return 0;
+
+err:
+ kfree_skb(ab->skb);
+ ab->skb = skb_peek(&ab->skb_list);
+ return -ENOMEM;
+}
+
+/**
+ * audit_buffer_aux_end - Switch back to the "main" record from an aux record
+ * @ab: audit_buffer
+ *
+ * Restores the "main" audit record to ab->skb.
+ */
+static void audit_buffer_aux_end(struct audit_buffer *ab)
+{
+ ab->skb = skb_peek(&ab->skb_list);
+}
+
+/**
+ * audit_log_subj_ctx - Add LSM subject information
+ * @ab: audit_buffer
+ * @prop: LSM subject properties.
+ *
+ * Add a subj= field and, if necessary, an AUDIT_MAC_TASK_CONTEXTS record.
+ */
+int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
{
- struct lsm_prop prop;
struct lsm_context ctx;
+ char *space = "";
int error;
+ int i;
- security_current_getlsmprop_subj(&prop);
- if (!lsmprop_is_set(&prop))
+ security_current_getlsmprop_subj(prop);
+ if (!lsmprop_is_set(prop))
return 0;
- error = security_lsmprop_to_secctx(&prop, &ctx);
- if (error < 0) {
- if (error != -EINVAL)
- goto error_path;
+ if (audit_subj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+ audit_log_format(ab, " subj=%s", ctx.context);
+ security_release_secctx(&ctx);
return 0;
}
-
- audit_log_format(ab, " subj=%s", ctx.context);
- security_release_secctx(&ctx);
+ /* Multiple LSMs provide contexts. Include an aux record. */
+ audit_log_format(ab, " subj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_subj_secctx_cnt; i++) {
+ error = security_lsmprop_to_secctx(prop, &ctx,
+ audit_subj_lsms[i]->id);
+ if (error < 0) {
+ /*
+ * Don't print anything. An LSM like BPF could
+ * claim to support contexts, but only do so under
+ * certain conditions.
+ */
+ if (error == -EOPNOTSUPP)
+ continue;
+ if (error != -EINVAL)
+ audit_panic("error in audit_log_subj_ctx");
+ } else {
+ audit_log_format(ab, "%ssubj_%s=%s", space,
+ audit_subj_lsms[i]->name, ctx.context);
+ space = " ";
+ security_release_secctx(&ctx);
+ }
+ }
+ audit_buffer_aux_end(ab);
return 0;
error_path:
- audit_panic("error in audit_log_task_context");
+ audit_panic("error in audit_log_subj_ctx");
return error;
}
+EXPORT_SYMBOL(audit_log_subj_ctx);
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ struct lsm_prop prop;
+
+ security_current_getlsmprop_subj(&prop);
+ return audit_log_subj_ctx(ab, &prop);
+}
EXPORT_SYMBOL(audit_log_task_context);
+int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
+{
+ int i;
+ int rc;
+ int error = 0;
+ char *space = "";
+ struct lsm_context ctx;
+
+ if (audit_obj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return error;
+ }
+ audit_log_format(ab, " obj=%s", ctx.context);
+ security_release_secctx(&ctx);
+ return 0;
+ }
+ audit_log_format(ab, " obj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_obj_secctx_cnt; i++) {
+ rc = security_lsmprop_to_secctx(prop, &ctx,
+ audit_obj_lsms[i]->id);
+ if (rc < 0) {
+ audit_log_format(ab, "%sobj_%s=?", space,
+ audit_obj_lsms[i]->name);
+ if (rc != -EINVAL)
+ audit_panic("error in audit_log_obj_ctx");
+ error = rc;
+ } else {
+ audit_log_format(ab, "%sobj_%s=%s", space,
+ audit_obj_lsms[i]->name, ctx.context);
+ security_release_secctx(&ctx);
+ }
+ space = " ";
+ }
+
+ audit_buffer_aux_end(ab);
+ return error;
+
+error_path:
+ audit_panic("error in audit_log_obj_ctx");
+ return error;
+}
+
void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm)
{
@@ -2411,6 +2601,28 @@ int audit_signal_info(int sig, struct task_struct *t)
}
/**
+ * __audit_log_end - enqueue one audit record
+ * @skb: the buffer to send
+ */
+static void __audit_log_end(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+
+ if (audit_rate_check()) {
+ /* setup the netlink header, see the comments in
+ * kauditd_send_multicast_skb() for length quirks */
+ nlh = nlmsg_hdr(skb);
+ nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+
+ /* queue the netlink packet */
+ skb_queue_tail(&audit_queue, skb);
+ } else {
+ audit_log_lost("rate limit exceeded");
+ kfree_skb(skb);
+ }
+}
+
+/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
@@ -2422,25 +2634,15 @@ int audit_signal_info(int sig, struct task_struct *t)
void audit_log_end(struct audit_buffer *ab)
{
struct sk_buff *skb;
- struct nlmsghdr *nlh;
if (!ab)
return;
- if (audit_rate_check()) {
- skb = ab->skb;
- ab->skb = NULL;
-
- /* setup the netlink header, see the comments in
- * kauditd_send_multicast_skb() for length quirks */
- nlh = nlmsg_hdr(skb);
- nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ __audit_log_end(skb);
- /* queue the netlink packet and poke the kauditd thread */
- skb_queue_tail(&audit_queue, skb);
- wake_up_interruptible(&kauditd_wait);
- } else
- audit_log_lost("rate limit exceeded");
+ /* poke the kauditd thread */
+ wake_up_interruptible(&kauditd_wait);
audit_buffer_free(ab);
}
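
For illustration only (not part of the patch), a hedged sketch of a caller of the reworked buffer: audit_log_start(), audit_log_format(), audit_log_task_context() and audit_log_end() are the existing entry points shown above, and any AUDIT_MAC_TASK_CONTEXTS aux record is queued on ab->skb_list behind the scenes; demo_log_event() and its message text are hypothetical.

#include <linux/audit.h>
#include <linux/gfp.h>

/* Sketch only: one logical event, possibly emitted as a main record plus an aux record. */
static void demo_log_event(struct audit_context *ctx)
{
	struct audit_buffer *ab;

	ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_USER);
	if (!ab)
		return;			/* auditing disabled or allocation failed */
	audit_log_format(ab, "op=demo res=1");
	audit_log_task_context(ab);	/* may log "subj=?" and queue an aux record */
	audit_log_end(ab);		/* flushes every skb queued on ab->skb_list */
}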
diff --git a/kernel/audit.h b/kernel/audit.h
index 2a24d01c5fb0..0f05933a173b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -99,6 +99,12 @@ struct audit_proctitle {
char *value; /* the cmdline field */
};
+/* A timestamp/serial pair to identify an event */
+struct audit_stamp {
+ struct timespec64 ctime; /* time of syscall entry */
+ unsigned int serial; /* serial number for record */
+};
+
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
@@ -108,10 +114,9 @@ struct audit_context {
AUDIT_CTX_URING, /* in use by io_uring */
} context;
enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
+ struct audit_stamp stamp; /* event identifier */
int major; /* syscall number */
int uring_op; /* uring operation */
- struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
@@ -263,7 +268,7 @@ extern void audit_put_tty(struct tty_struct *tty);
extern unsigned int audit_serial(void);
#ifdef CONFIG_AUDITSYSCALL
extern int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial);
+ struct audit_stamp *stamp);
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
@@ -304,7 +309,7 @@ extern void audit_filter_inodes(struct task_struct *tsk,
struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
-#define auditsc_get_stamp(c, t, s) 0
+#define auditsc_get_stamp(c, s) 0
#define audit_put_watch(w) do { } while (0)
#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
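
As a reading aid (an assumption, not code from the series), the consolidated stamp maps onto the record prefix exactly as audit_log_start() formats it; demo_format_stamp() is a hypothetical helper:

#include <linux/kernel.h>

/* Sketch only: how audit_stamp fields form the "audit(sec.msec:serial)" prefix. */
static void demo_format_stamp(char *buf, size_t len, const struct audit_stamp *stamp)
{
	snprintf(buf, len, "audit(%llu.%03lu:%u): ",
		 (unsigned long long)stamp->ctime.tv_sec,
		 stamp->ctime.tv_nsec / 1000000,
		 stamp->serial);
}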
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0eae2a3c895..1605df0a171e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,8 +93,10 @@ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
+ size_t sz;
- tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
+ sz = strlen(s) + 1;
+ tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -103,7 +105,7 @@ static struct audit_tree *alloc_tree(const char *s)
INIT_LIST_HEAD(&tree->list);
INIT_LIST_HEAD(&tree->same_root);
tree->root = NULL;
- strcpy(tree->pathname, s);
+ strscpy(tree->pathname, s, sz);
}
return tree;
}
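
The same struct_size()/strscpy() pairing generalises to any flexible-array string member; below is a minimal hedged sketch on a hypothetical struct demo_name, not code from the patch:

#include <linux/overflow.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/string.h>

struct demo_name {
	refcount_t count;
	char name[];			/* flexible array member */
};

static struct demo_name *demo_alloc(const char *s)
{
	size_t sz = strlen(s) + 1;	/* one length drives both allocation and copy */
	struct demo_name *d;

	d = kmalloc(struct_size(d, name, sz), GFP_KERNEL);
	if (d) {
		refcount_set(&d->count, 1);
		strscpy(d->name, s, sz);
	}
	return d;
}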
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f7708fe2c457..c401082d9b25 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1440,7 +1440,7 @@ static int update_lsm_rule(struct audit_krule *r)
}
/* This function will re-initialize the lsm_rule field of all applicable rules.
- * It will traverse the filter lists serarching for rules that contain LSM
+ * It will traverse the filter lists searching for rules that contain LSM
* specific filter fields. When such a rule is found, it is copied, the
* LSM field is re-initialized, and the old rule is replaced with the
* updated rule. */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index eb98cd6fe91f..d1966144bdfe 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -994,10 +994,10 @@ static void audit_reset_context(struct audit_context *ctx)
*/
ctx->current_state = ctx->state;
- ctx->serial = 0;
+ ctx->stamp.serial = 0;
+ ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
ctx->major = 0;
ctx->uring_op = 0;
- ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
memset(ctx->argv, 0, sizeof(ctx->argv));
ctx->return_code = 0;
ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
@@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
char *comm)
{
struct audit_buffer *ab;
- struct lsm_context ctx;
int rc = 0;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
@@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
- if (lsmprop_is_set(prop)) {
- if (security_lsmprop_to_secctx(prop, &ctx) < 0) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx.context);
- security_release_secctx(&ctx);
- }
- }
+ if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop))
+ rc = 1;
+
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
audit_log_end(ab);
@@ -1392,15 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic)
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
if (lsmprop_is_set(&context->ipc.oprop)) {
- struct lsm_context lsmctx;
-
- if (security_lsmprop_to_secctx(&context->ipc.oprop,
- &lsmctx) < 0) {
+ if (audit_log_obj_ctx(ab, &context->ipc.oprop))
*call_panic = 1;
- } else {
- audit_log_format(ab, " obj=%s", lsmctx.context);
- security_release_secctx(&lsmctx);
- }
}
if (context->ipc.has_perm) {
audit_log_end(ab);
@@ -1557,17 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
- if (lsmprop_is_set(&n->oprop)) {
- struct lsm_context ctx;
-
- if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) {
- if (call_panic)
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx.context);
- security_release_secctx(&ctx);
- }
- }
+ if (lsmprop_is_set(&n->oprop) &&
+ audit_log_obj_ctx(ab, &n->oprop))
+ *call_panic = 2;
/* log the audit_names record type */
switch (n->type) {
@@ -1785,8 +1763,9 @@ static void audit_log_exit(void)
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
- &context->target_ref, context->target_comm))
- call_panic = 1;
+ &context->target_ref,
+ context->target_comm))
+ call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
@@ -1917,7 +1896,7 @@ void __audit_uring_entry(u8 op)
ctx->context = AUDIT_CTX_URING;
ctx->current_state = ctx->state;
- ktime_get_coarse_real_ts64(&ctx->ctime);
+ ktime_get_coarse_real_ts64(&ctx->stamp.ctime);
}
/**
@@ -2039,7 +2018,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[3] = a4;
context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- ktime_get_coarse_real_ts64(&context->ctime);
+ ktime_get_coarse_real_ts64(&context->stamp.ctime);
}
/**
@@ -2508,21 +2487,17 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
/**
* auditsc_get_stamp - get local copies of audit_context values
* @ctx: audit_context for the task
- * @t: timespec64 to store time recorded in the audit_context
- * @serial: serial value that is recorded in the audit_context
+ * @stamp: timestamp to record
*
* Also sets the context as auditable.
*/
-int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial)
+int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp)
{
if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
- if (!ctx->serial)
- ctx->serial = audit_serial();
- t->tv_sec = ctx->ctime.tv_sec;
- t->tv_nsec = ctx->ctime.tv_nsec;
- *serial = ctx->serial;
+ if (!ctx->stamp.serial)
+ ctx->stamp.serial = audit_serial();
+ *stamp = ctx->stamp;
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_STATE_RECORD;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3615c06b7dfa..ec3a57a5fba1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
@@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9fb1f957a093..8340cecd1b35 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23859,6 +23859,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
+BTF_ID(func, ___migrate_enable)
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index b14e61c64a34..22051b4f1ccb 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-void cgroup_attach_lock(bool lock_threadgroup);
-void cgroup_attach_unlock(bool lock_threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
+ enum cgroup_attach_lock_mode *lock_mode)
__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
__releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
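
The enum cgroup_attach_lock_mode used in these prototypes is declared elsewhere in the series and is not visible in this hunk; judging by the values referenced from cgroup.c further down, it presumably has roughly the following shape (an assumption, not the verbatim declaration):

/* Assumed shape of the lock-mode enum; the three values are the ones used in this diff. */
enum cgroup_attach_lock_mode {
	CGRP_ATTACH_LOCK_NONE,			/* no threadgroup locking needed */
	CGRP_ATTACH_LOCK_GLOBAL,		/* percpu cgroup_threadgroup_rwsem */
	CGRP_ATTACH_LOCK_PER_THREADGROUP,	/* tsk->signal->cgroup_threadgroup_rwsem */
};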
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 2a4a387f867a..a9e029b570c8 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -10,6 +10,7 @@
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
@@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
cgroup_lock();
- cgroup_attach_lock(true);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- cgroup_attach_unlock(true);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
cgroup_unlock();
return retval;
@@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
cgroup_lock();
- cgroup_attach_lock(true);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
@@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(true);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
cgroup_unlock();
return ret;
}
@@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
struct task_struct *task;
const struct cred *cred, *tcred;
ssize_t ret;
- bool locked;
+ enum cgroup_attach_lock_mode lock_mode;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, ctx->release_agent);
+ strscpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
@@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void)
* Cap @max_active to 1 too.
*/
cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
+ WQ_PERCPU, 1);
BUG_ON(!cgroup_pidlist_destroy_wq);
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a0d5d62f1483..da4eaee52178 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -125,7 +125,7 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
*
* A cgroup destruction should enqueue work sequentially to:
@@ -240,6 +240,14 @@ static u16 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
.ns.__ns_ref = REFCOUNT_INIT(2),
@@ -1327,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
{
bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
- /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ /*
+ * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+ * favordynmods can flip while a task is between
+ * cgroup_threadgroup_change_begin() and end(), so down_write the global
+ * cgroup_threadgroup_rwsem to synchronize them.
+ *
+ * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+ * cgroup_threadgroup_rwsem doesn't exclude tasks between
+ * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to
+ * turn off. As the scenario is unlikely, simply disallow disabling once
+ * enabled and print out a warning.
+ */
+ percpu_down_write(&cgroup_threadgroup_rwsem);
if (favor && !favoring) {
+ cgroup_enable_per_threadgroup_rwsem = true;
rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
} else if (!favor && favoring) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
}
static int cgroup_init_root_id(struct cgroup_root *root)
@@ -2484,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
* cgroup_attach_lock - Lock for ->attach()
- * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether to acquire a rwsem, and which one to acquire
+ * @tsk: thread group to lock
*
* cgroup migration sometimes needs to stabilize threadgroups against forks and
* exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
@@ -2504,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
* Resolve the situation by always acquiring cpus_read_lock() before optionally
* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
* CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take the per-threadgroup rwsem to reduce
+ * overhead on dynamic cgroup modifications. See the comment above the
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
*/
-void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
cpus_read_lock();
- if (lock_threadgroup)
+
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_down_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
}
/**
* cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether to release a rwsem, and which one to release
+ * @tsk: thread group to unlock
*/
-void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
- if (lock_threadgroup)
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
+
cpus_read_unlock();
}
@@ -2969,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
/* look up all src csets */
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
@@ -2993,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *threadgroup_locked)
+ enum cgroup_attach_lock_mode *lock_mode)
{
struct task_struct *tsk;
pid_t pid;
@@ -3001,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- /*
- * If we migrate a single thread, we don't care about threadgroup
- * stability. If the thread is `current`, it won't exit(2) under our
- * hands or change PID through exec(2). We exclude
- * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
- * callers by cgroup_mutex.
- * Therefore, we can skip the global lock.
- */
- lockdep_assert_held(&cgroup_mutex);
- *threadgroup_locked = pid || threadgroup;
- cgroup_attach_lock(*threadgroup_locked);
-
+retry_find_task:
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
tsk = ERR_PTR(-ESRCH);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
} else {
tsk = current;
@@ -3035,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
-
get_task_struct(tsk);
- goto out_unlock_rcu;
+ rcu_read_unlock();
+
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+ * by cgroup_mutex. Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (pid || threadgroup) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+ else
+ *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ } else {
+ *lock_mode = CGRP_ATTACH_LOCK_NONE;
+ }
+
+ cgroup_attach_lock(*lock_mode, tsk);
+
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * A race with de_thread from another thread's exec()
+ * may strip us of our leadership. If this happens,
+ * throw this task away and try again.
+ */
+ cgroup_attach_unlock(*lock_mode, tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
+ return tsk;
-out_unlock_threadgroup:
- cgroup_attach_unlock(*threadgroup_locked);
- *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
{
- struct cgroup_subsys *ss;
- int ssid;
+ cgroup_attach_unlock(lock_mode, task);
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
-
- cgroup_attach_unlock(threadgroup_locked);
-
- for_each_subsys(ss, ssid)
- if (ss->post_attach)
- ss->post_attach();
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -3113,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ enum cgroup_attach_lock_mode lock_mode;
bool has_tasks;
int ret;
@@ -3144,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
* write-locking can be skipped safely.
*/
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
- cgroup_attach_lock(has_tasks);
+
+ if (has_tasks)
+ lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ else
+ lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+ cgroup_attach_lock(lock_mode, NULL);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3165,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(has_tasks);
+ cgroup_attach_unlock(lock_mode, NULL);
return ret;
}
@@ -3788,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ unsigned int sequence;
+ u64 freeze_time;
+
+ do {
+ sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+ freeze_time = cgrp->freezer.frozen_nsec;
+ /* Add in current freezer interval if the cgroup is freezing. */
+ if (test_bit(CGRP_FREEZE, &cgrp->flags))
+ freeze_time += (ktime_get_ns() -
+ cgrp->freezer.freeze_start_nsec);
+ } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+ do_div(freeze_time, NSEC_PER_USEC);
+ seq_printf(seq, "frozen_usec %llu\n", freeze_time);
+
+ return 0;
+}
+
#ifdef CONFIG_CGROUP_SCHED
/**
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
@@ -5267,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct task_struct *task;
const struct cred *saved_cred;
ssize_t ret;
- bool threadgroup_locked;
+ enum cgroup_attach_lock_mode lock_mode;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -5299,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, threadgroup_locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -5381,6 +5479,11 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.stat.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_core_local_stat_show,
+ },
+ {
.name = "cgroup.freeze",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_freeze_show,
@@ -5789,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ seqcount_init(&cgrp->freezer.freeze_seq);
if (cgrp->freezer.e_freeze) {
/*
* Set the CGRP_FREEZE flag, so when a process will be
@@ -5797,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* consider it frozen immediately.
*/
set_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.freeze_start_nsec = ktime_get_ns();
set_bit(CGRP_FROZEN, &cgrp->flags);
}
@@ -6352,13 +6457,13 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
BUG_ON(!cgroup_offline_wq);
- cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
+ cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
BUG_ON(!cgroup_release_wq);
- cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
+ cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
BUG_ON(!cgroup_free_wq);
return 0;
}
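
From userspace the new file is a plain key/value read; a hedged sketch follows, where the /sys/fs/cgroup mount point and the "demo" cgroup name are assumptions while the cgroup.stat.local name and frozen_usec key come from the patch above:

/* Userspace sketch only; the cgroup path is an assumption. */
#include <stdio.h>

int main(void)
{
	unsigned long long frozen_usec = 0;
	FILE *f = fopen("/sys/fs/cgroup/demo/cgroup.stat.local", "r");

	if (!f)
		return 1;
	if (fscanf(f, "frozen_usec %llu", &frozen_usec) == 1)
		printf("cgroup was frozen for %llu us\n", frozen_usec);
	fclose(f);
	return 0;
}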
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 383963e28ac6..337608f408ce 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -38,7 +38,6 @@ enum prs_errcode {
/* bits in struct cpuset flags field */
typedef enum {
- CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
+ return css_is_online(&cs->css) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on)
ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);
/*
* cpuset-v1.c
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index b69a7db67090..12e76774c75b 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- cpus_read_lock();
- cpuset_lock();
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- cpuset_unlock();
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval;
}
@@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- cpus_read_lock();
- cpuset_lock();
+ cpuset_full_lock();
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- cpuset_unlock();
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 27adb04df675..52468d2c178a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -40,6 +40,7 @@
#include <linux/sched/isolation.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/task_work.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -131,11 +132,6 @@ static bool force_sd_rebuild;
#define PRS_INVALID_ROOT -1
#define PRS_INVALID_ISOLATED -2
-static inline bool is_prs_invalid(int prs_state)
-{
- return prs_state < 0;
-}
-
/*
* Temporary cpumasks for working with partitions that are passed among
* functions to avoid memory allocation in inner functions.
@@ -159,16 +155,21 @@ void dec_dl_tasks_cs(struct task_struct *p)
cs->nr_deadline_tasks--;
}
-static inline int is_partition_valid(const struct cpuset *cs)
+static inline bool is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
-static inline int is_partition_invalid(const struct cpuset *cs)
+static inline bool is_partition_invalid(const struct cpuset *cs)
{
return cs->partition_root_state < 0;
}
+static inline bool cs_is_member(const struct cpuset *cs)
+{
+ return cs->partition_root_state == PRS_MEMBER;
+}
+
/*
* Callers should hold callback_lock to modify partition_root_state.
*/
@@ -207,7 +208,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
static struct cpuset top_cpuset = {
- .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+ .flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
.relax_domain_level = -1,
@@ -250,6 +251,12 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
+/**
+ * cpuset_lock - Acquire the global cpuset mutex
+ *
+ * This locks the global cpuset mutex to prevent modifications to cpuset
+ * hierarchy and configurations. This helper alone is not sufficient to
+ * make modifications.
+ */
void cpuset_lock(void)
{
mutex_lock(&cpuset_mutex);
@@ -260,6 +267,24 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+/**
+ * cpuset_full_lock - Acquire full protection for cpuset modification
+ *
+ * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex
+ * to safely modify cpuset data.
+ */
+void cpuset_full_lock(void)
+{
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+}
+
+void cpuset_full_unlock(void)
+{
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+}
+
static DEFINE_SPINLOCK(callback_lock);
void cpuset_callback_lock_irq(void)
@@ -411,94 +436,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
}
/**
- * alloc_cpumasks - allocate three cpumasks for cpuset
- * @cs: the cpuset that have cpumasks to be allocated.
- * @tmp: the tmpmasks structure pointer
+ * alloc_cpumasks - Allocate an array of cpumask variables
+ * @pmasks: Pointer to array of cpumask_var_t pointers
+ * @size: Number of cpumasks to allocate
* Return: 0 if successful, -ENOMEM otherwise.
*
- * Only one of the two input arguments should be non-NULL.
+ * Allocates @size cpumasks and initializes them to empty. Returns 0 on
+ * success, -ENOMEM on allocation failure. On failure, any previously
+ * allocated cpumasks are freed.
*/
-static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
{
- cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
+ int i;
- if (cs) {
- pmask1 = &cs->cpus_allowed;
- pmask2 = &cs->effective_cpus;
- pmask3 = &cs->effective_xcpus;
- pmask4 = &cs->exclusive_cpus;
- } else {
- pmask1 = &tmp->new_cpus;
- pmask2 = &tmp->addmask;
- pmask3 = &tmp->delmask;
- pmask4 = NULL;
+ for (i = 0; i < size; i++) {
+ if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {
+ while (--i >= 0)
+ free_cpumask_var(*pmasks[i]);
+ return -ENOMEM;
+ }
}
-
- if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
- return -ENOMEM;
-
- if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
- goto free_one;
-
- if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
- goto free_two;
-
- if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
- goto free_three;
-
-
return 0;
+}
+
+/**
+ * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.
+ * @tmp: Pointer to tmpmasks structure to populate
+ * Return: 0 on success, -ENOMEM on allocation failure
+ */
+static inline int alloc_tmpmasks(struct tmpmasks *tmp)
+{
+ /*
+ * Array of pointers to the three cpumask_var_t fields in tmpmasks.
+ * Note: Array size must match actual number of masks (3)
+ */
+ cpumask_var_t *pmask[3] = {
+ &tmp->new_cpus,
+ &tmp->addmask,
+ &tmp->delmask
+ };
-free_three:
- free_cpumask_var(*pmask3);
-free_two:
- free_cpumask_var(*pmask2);
-free_one:
- free_cpumask_var(*pmask1);
- return -ENOMEM;
+ return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
}
/**
- * free_cpumasks - free cpumasks in a tmpmasks structure
- * @cs: the cpuset that have cpumasks to be free.
+ * free_tmpmasks - free cpumasks in a tmpmasks structure
* @tmp: the tmpmasks structure pointer
*/
-static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline void free_tmpmasks(struct tmpmasks *tmp)
{
- if (cs) {
- free_cpumask_var(cs->cpus_allowed);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->effective_xcpus);
- free_cpumask_var(cs->exclusive_cpus);
- }
- if (tmp) {
- free_cpumask_var(tmp->new_cpus);
- free_cpumask_var(tmp->addmask);
- free_cpumask_var(tmp->delmask);
- }
+ if (!tmp)
+ return;
+
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
}
/**
- * alloc_trial_cpuset - allocate a trial cpuset
- * @cs: the cpuset that the trial cpuset duplicates
+ * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset
+ * @cs: Source cpuset to duplicate (NULL for a fresh allocation)
+ *
+ * Creates a new cpuset by either:
+ * 1. Duplicating an existing cpuset (if @cs is non-NULL), or
+ * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)
+ *
+ * Return: Pointer to newly allocated cpuset on success, NULL on failure
*/
-static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
+static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
- trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+ /* Allocate base structure */
+ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
+ kzalloc(sizeof(*cs), GFP_KERNEL);
if (!trial)
return NULL;
- if (alloc_cpumasks(trial, NULL)) {
+ /* Setup cpumask pointer array */
+ cpumask_var_t *pmask[4] = {
+ &trial->cpus_allowed,
+ &trial->effective_cpus,
+ &trial->effective_xcpus,
+ &trial->exclusive_cpus
+ };
+
+ if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
kfree(trial);
return NULL;
}
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
- cpumask_copy(trial->effective_cpus, cs->effective_cpus);
- cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
- cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ /* Copy masks if duplicating */
+ if (cs) {
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+ cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ }
+
return trial;
}
@@ -508,7 +543,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumasks(cs, NULL);
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->effective_xcpus);
+ free_cpumask_var(cs->exclusive_cpus);
kfree(cs);
}
@@ -540,6 +578,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
return true;
}
+/**
+ * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * Conflict detection rules:
+ * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
+ * 2. exclusive_cpus masks cannot intersect between cpusets
+ * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ */
+static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ /* If either cpuset is exclusive, check if they are mutually exclusive */
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return !cpusets_are_exclusive(cs1, cs2);
+
+ /* Exclusive_cpus cannot intersect */
+ if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
+ return true;
+
+ /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
+ if (!cpumask_empty(cs1->cpus_allowed) &&
+ cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
+ return true;
+
+ if (!cpumask_empty(cs2->cpus_allowed) &&
+ cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ return true;
+
+ return false;
+}
+
+static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
+ return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+ return false;
+}
+
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -621,38 +700,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
*/
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
- bool txset, cxset; /* Are exclusive_cpus set? */
-
if (c == cur)
continue;
-
- txset = !cpumask_empty(trial->exclusive_cpus);
- cxset = !cpumask_empty(c->exclusive_cpus);
- if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
- (txset && cxset)) {
- if (!cpusets_are_exclusive(trial, c))
- goto out;
- } else if (txset || cxset) {
- struct cpumask *xcpus, *acpus;
-
- /*
- * When just one of the exclusive_cpus's is set,
- * cpus_allowed of the other cpuset, if set, cannot be
- * a subset of it or none of those CPUs will be
- * available if these exclusive CPUs are activated.
- */
- if (txset) {
- xcpus = trial->exclusive_cpus;
- acpus = c->cpus_allowed;
- } else {
- xcpus = c->exclusive_cpus;
- acpus = trial->cpus_allowed;
- }
- if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
- goto out;
- }
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ if (cpus_excl_conflict(trial, c))
+ goto out;
+ if (mems_excl_conflict(trial, c))
goto out;
}
@@ -1363,38 +1415,25 @@ bool cpuset_cpu_is_isolated(int cpu)
}
EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
-/*
- * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
- * @cs: cpuset
- * @xcpus: effective exclusive CPUs value to be set
- * @real_cs: the real cpuset (can be NULL)
- * Return: 0 if there is no sibling conflict, > 0 otherwise
+/**
+ * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
+ * @parent: Parent cpuset containing all siblings
+ * @cs: Current cpuset (will be skipped)
+ * @excpus: exclusive effective CPU mask to modify
*
- * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to
- * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus
- * as well. The provision of real_cs means that a cpumask is being changed and
- * the given cs is a trial one.
+ * This function ensures the given @excpus mask doesn't include any CPUs that
+ * are exclusively allocated to sibling cpusets. It walks through all siblings
+ * of @cs under @parent and removes their exclusive CPUs from @excpus.
*/
-static int compute_effective_exclusive_cpumask(struct cpuset *cs,
- struct cpumask *xcpus,
- struct cpuset *real_cs)
+static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *excpus)
{
struct cgroup_subsys_state *css;
- struct cpuset *parent = parent_cs(cs);
struct cpuset *sibling;
int retval = 0;
- if (!xcpus)
- xcpus = cs->effective_xcpus;
-
- cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
-
- if (!real_cs) {
- if (!cpumask_empty(cs->exclusive_cpus))
- return 0;
- } else {
- cs = real_cs;
- }
+ if (cpumask_empty(excpus))
+ return retval;
/*
* Exclude exclusive CPUs from siblings
@@ -1404,20 +1443,66 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs,
if (sibling == cs)
continue;
- if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
- cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
+ if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
+ cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
retval++;
continue;
}
- if (cpumask_intersects(xcpus, sibling->effective_xcpus)) {
- cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
+ if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
retval++;
}
}
rcu_read_unlock();
+
return retval;
}
+/*
+ * compute_excpus - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
+ *
+ * If exclusive_cpus isn't explicitly set, we have to scan the sibling cpusets
+ * and exclude their exclusive_cpus or effective_xcpus as well.
+ */
+static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
+
+ if (!cpumask_empty(cs->exclusive_cpus))
+ return 0;
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+/*
+ * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset
+ * @trialcs: The trial cpuset containing the proposed new configuration
+ * @cs: The original cpuset that the trial configuration is based on
+ * Return: 0 if successful with no sibling conflict, >0 if a conflict is found
+ *
+ * Computes the effective_xcpus for a trial configuration. @cs is provided to represent
+ * the real cs.
+ */
+static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(trialcs);
+ struct cpumask *excpus = trialcs->effective_xcpus;
+
+ /* trialcs is a member; cpuset.cpus has no impact on excpus */
+ if (cs_is_member(cs))
+ cpumask_and(excpus, trialcs->exclusive_cpus,
+ parent->effective_xcpus);
+ else
+ cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
static inline bool is_remote_partition(struct cpuset *cs)
{
return !list_empty(&cs->remote_sibling);
@@ -1459,7 +1544,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* Note that creating a remote partition with any local partition root
* above it or remote partition root underneath it is not allowed.
*/
- compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
+ compute_excpus(cs, tmp->new_cpus);
WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
@@ -1508,7 +1593,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
cs->partition_root_state = PRS_MEMBER;
/* effective_xcpus may need to be changed */
- compute_effective_exclusive_cpumask(cs, NULL, NULL);
+ compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
@@ -1677,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_invalidate) {
- if (is_prs_invalid(old_prs))
+ if (is_partition_invalid(cs))
return 0;
/*
@@ -1709,13 +1794,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
- * Need to call compute_effective_exclusive_cpumask() in case
+ * Need to call compute_excpus() in case
* exclusive_cpus not set. Sibling conflict should only happen
* if exclusive_cpus isn't set.
*/
xcpus = tmp->delmask;
- if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
+ if (compute_excpus(cs, xcpus))
WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
/*
* Enabling partition root is not allowed if its
@@ -1727,11 +1813,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (prstate_housekeeping_conflict(new_prs, xcpus))
return PERR_HKEEPING;
- /*
- * A parent can be left with no CPU as long as there is no
- * task directly associated with the parent partition.
- */
- if (nocpu)
+ if (tasks_nocpu_error(parent, cs, xcpus))
return PERR_NOCPUS;
/*
@@ -1748,7 +1830,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
deleting = true;
subparts_delta++;
- new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus back to parent's effective_cpus
@@ -1788,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* For invalid partition:
* delmask = newmask & parent->effective_xcpus
*/
- if (is_prs_invalid(old_prs)) {
+ if (is_partition_invalid(cs)) {
adding = false;
deleting = cpumask_and(tmp->delmask,
newmask, parent->effective_xcpus);
@@ -1837,7 +1918,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* A partition error happens when parent has tasks and all
* its effective CPUs will have to be distributed out.
*/
- WARN_ON_ONCE(!is_partition_valid(parent));
if (nocpu) {
part_error = PERR_NOCPUS;
if (is_partition_valid(cs))
@@ -1996,7 +2076,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* 2) All the effective_cpus will be used up and cp
* has tasks
*/
- compute_effective_exclusive_cpumask(cs, new_ecpus, NULL);
+ compute_excpus(cs, new_ecpus);
cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
rcu_read_lock();
@@ -2075,7 +2155,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* its value is being processed.
*/
if (remote && (cp != cs)) {
- compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL);
+ compute_excpus(cp, tmp->new_cpus);
if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
@@ -2177,7 +2257,7 @@ get_css:
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_effective_exclusive_cpumask(cp, NULL, NULL);
+ compute_excpus(cp, cp->effective_xcpus);
/*
* Make sure effective_xcpus is properly set for a valid
@@ -2284,82 +2364,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
rcu_read_unlock();
}
-/**
- * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
- * @cs: the cpuset to consider
- * @trialcs: trial cpuset
- * @buf: buffer of cpu numbers written to this cpuset
- */
-static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
- const char *buf)
+static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)
{
int retval;
- struct tmpmasks tmp;
- struct cpuset *parent = parent_cs(cs);
- bool invalidate = false;
- bool force = false;
- int old_prs = cs->partition_root_state;
- /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */
- if (cs == &top_cpuset)
- return -EACCES;
+ retval = cpulist_parse(buf, out_mask);
+ if (retval < 0)
+ return retval;
+ if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))
+ return -EINVAL;
- /*
- * An empty cpus_allowed is ok only if the cpuset has no tasks.
- * Since cpulist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have cpus.
- */
- if (!*buf) {
- cpumask_clear(trialcs->cpus_allowed);
- if (cpumask_empty(trialcs->exclusive_cpus))
- cpumask_clear(trialcs->effective_xcpus);
- } else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
- if (retval < 0)
- return retval;
+ return 0;
+}
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
- return -EINVAL;
+/**
+ * validate_partition - Validate a cpuset partition configuration
+ * @cs: The cpuset to validate
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ *
+ * If any validation check fails, the corresponding PRS error code is
+ * returned; the caller records it in the cpuset's prs_err field.
+ *
+ * Return: PRS error code (0 if valid, non-zero error code if invalid)
+ */
+static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)
+{
+ struct cpuset *parent = parent_cs(cs);
- /*
- * When exclusive_cpus isn't explicitly set, it is constrained
- * by cpus_allowed and parent's effective_xcpus. Otherwise,
- * trialcs->effective_xcpus is used as a temporary cpumask
- * for checking validity of the partition root.
- */
- trialcs->partition_root_state = PRS_MEMBER;
- if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
- compute_effective_exclusive_cpumask(trialcs, NULL, cs);
- }
+ if (cs_is_member(trialcs))
+ return PERR_NONE;
- /* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
- return 0;
+ if (cpumask_empty(trialcs->effective_xcpus))
+ return PERR_INVCPUS;
- if (alloc_cpumasks(NULL, &tmp))
- return -ENOMEM;
+ if (prstate_housekeeping_conflict(trialcs->partition_root_state,
+ trialcs->effective_xcpus))
+ return PERR_HKEEPING;
- if (old_prs) {
- if (is_partition_valid(cs) &&
- cpumask_empty(trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_INVCPUS;
- } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_HKEEPING;
- } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_NOCPUS;
- }
- }
+ if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))
+ return PERR_NOCPUS;
- /*
- * Check all the descendants in update_cpumasks_hier() if
- * effective_xcpus is to be changed.
- */
- force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+ return PERR_NONE;
+}
+
+static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ int retval;
+ struct cpuset *parent = parent_cs(cs);
retval = validate_change(cs, trialcs);
@@ -2374,7 +2426,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* partition. However, any conflicting sibling partitions
* have to be marked as invalid too.
*/
- invalidate = true;
+ trialcs->prs_err = PERR_NOTEXCL;
rcu_read_lock();
cpuset_for_each_child(cp, css, parent) {
struct cpumask *xcpus = user_xcpus(trialcs);
@@ -2382,36 +2434,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (is_partition_valid(cp) &&
cpumask_intersects(xcpus, cp->effective_xcpus)) {
rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
rcu_read_lock();
}
}
rcu_read_unlock();
retval = 0;
}
+ return retval;
+}
- if (retval < 0)
- goto out_free;
+/**
+ * partition_cpus_change - Handle partition state changes due to CPU mask updates
+ * @cs: The target cpuset being modified
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ * @tmp: Temporary masks for intermediate calculations
+ *
+ * This function handles partition state transitions triggered by CPU mask changes.
+ * CPU modifications may cause a partition to be disabled or require state updates.
+ */
+static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ enum prs_errcode prs_err;
- if (is_partition_valid(cs) ||
- (is_partition_invalid(cs) && !invalidate)) {
- struct cpumask *xcpus = trialcs->effective_xcpus;
+ if (cs_is_member(cs))
+ return;
- if (cpumask_empty(xcpus) && is_partition_invalid(cs))
- xcpus = trialcs->cpus_allowed;
+ prs_err = validate_partition(cs, trialcs);
+ if (prs_err)
+ trialcs->prs_err = cs->prs_err = prs_err;
- /*
- * Call remote_cpus_update() to handle valid remote partition
- */
- if (is_remote_partition(cs))
- remote_cpus_update(cs, NULL, xcpus, &tmp);
- else if (invalidate)
+ if (is_remote_partition(cs)) {
+ if (trialcs->prs_err)
+ remote_partition_disable(cs, tmp);
+ else
+ remote_cpus_update(cs, trialcs->exclusive_cpus,
+ trialcs->effective_xcpus, tmp);
+ } else {
+ if (trialcs->prs_err)
update_parent_effective_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
+ NULL, tmp);
else
update_parent_effective_cpumask(cs, partcmd_update,
- xcpus, &tmp);
+ trialcs->effective_xcpus, tmp);
}
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
+ if (retval < 0)
+ return retval;
+
+ /* Nothing to do if the cpus didn't change */
+ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ return 0;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
+
+ compute_trialcs_excpus(trialcs, cs);
+ trialcs->prs_err = PERR_NONE;
+
+ retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ if (retval < 0)
+ goto out_free;
+
+ /*
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ partition_cpus_change(cs, trialcs, &tmp);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
@@ -2427,7 +2535,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
out_free:
- free_cpumasks(NULL, &tmp);
+ free_tmpmasks(&tmp);
return retval;
}
@@ -2444,33 +2552,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
- struct cpuset *parent = parent_cs(cs);
- bool invalidate = false;
bool force = false;
int old_prs = cs->partition_root_state;
- if (!*buf) {
- cpumask_clear(trialcs->exclusive_cpus);
- cpumask_clear(trialcs->effective_xcpus);
- } else {
- retval = cpulist_parse(buf, trialcs->exclusive_cpus);
- if (retval < 0)
- return retval;
- }
+ retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
+ if (retval < 0)
+ return retval;
/* Nothing to do if the CPUs didn't change */
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (*buf) {
- trialcs->partition_root_state = PRS_MEMBER;
- /*
- * Reject the change if there is exclusive CPUs conflict with
- * the siblings.
- */
- if (compute_effective_exclusive_cpumask(trialcs, NULL, cs))
- return -EINVAL;
- }
+ /*
+ * Reject the change if the exclusive CPUs conflict with those of
+ * the siblings.
+ */
+ if (compute_trialcs_excpus(trialcs, cs))
+ return -EINVAL;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2482,35 +2580,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval)
return retval;
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_tmpmasks(&tmp))
return -ENOMEM;
- if (old_prs) {
- if (cpumask_empty(trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_INVCPUS;
- } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_HKEEPING;
- } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_NOCPUS;
- }
+ trialcs->prs_err = PERR_NONE;
+ partition_cpus_change(cs, trialcs, &tmp);
- if (is_remote_partition(cs)) {
- if (invalidate)
- remote_partition_disable(cs, &tmp);
- else
- remote_cpus_update(cs, trialcs->exclusive_cpus,
- trialcs->effective_xcpus, &tmp);
- } else if (invalidate) {
- update_parent_effective_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
- } else {
- update_parent_effective_cpumask(cs, partcmd_update,
- trialcs->effective_xcpus, &tmp);
- }
- }
spin_lock_irq(&callback_lock);
cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
@@ -2530,7 +2605,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
- free_cpumasks(NULL, &tmp);
+ free_tmpmasks(&tmp);
return 0;
}
@@ -2582,9 +2657,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-static void cpuset_post_attach(void)
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
{
flush_workqueue(cpuset_migrate_mm_wq);
+ kfree(head);
+}
+
+static void schedule_flush_migrate_mm(void)
+{
+ struct callback_head *flush_cb;
+
+ flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+ if (!flush_cb)
+ return;
+
+ init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+ if (task_work_add(current, flush_cb, TWA_RESUME))
+ kfree(flush_cb);
}
/*
@@ -2750,32 +2840,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
- * it's read-only
- */
- if (cs == &top_cpuset) {
- retval = -EACCES;
- goto done;
- }
-
- /*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
- * Since nodelist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have memory.
+ * The validate_change() call ensures that cpusets with tasks have memory.
*/
- if (!*buf) {
- nodes_clear(trialcs->mems_allowed);
- } else {
- retval = nodelist_parse(buf, trialcs->mems_allowed);
- if (retval < 0)
- goto done;
+ retval = nodelist_parse(buf, trialcs->mems_allowed);
+ if (retval < 0)
+ goto done;
- if (!nodes_subset(trialcs->mems_allowed,
- top_cpuset.mems_allowed)) {
- retval = -EINVAL;
- goto done;
- }
+ if (!nodes_subset(trialcs->mems_allowed,
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
+ goto done;
}
if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
@@ -2826,7 +2901,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int spread_flag_changed;
int err;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs)
return -ENOMEM;
@@ -2884,10 +2959,10 @@ static int update_prstate(struct cpuset *cs, int new_prs)
/*
* Treat a previously invalid partition root as if it is a "member".
*/
- if (new_prs && is_prs_invalid(old_prs))
+ if (new_prs && is_partition_invalid(cs))
old_prs = PRS_MEMBER;
- if (alloc_cpumasks(NULL, &tmpmask))
+ if (alloc_tmpmasks(&tmpmask))
return -ENOMEM;
err = update_partition_exclusive_flag(cs, new_prs);
@@ -2983,7 +3058,7 @@ out:
notify_partition_change(cs, old_prs);
if (force_sd_rebuild)
rebuild_sched_domains_locked();
- free_cpumasks(NULL, &tmpmask);
+ free_tmpmasks(&tmpmask);
return 0;
}
@@ -3141,6 +3216,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
bool cpus_updated, mems_updated;
+ bool queue_task_work = false;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
@@ -3191,15 +3267,18 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs))
+ if (is_memory_migrate(cs)) {
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- else
+ queue_task_work = true;
+ } else
mmput(mm);
}
}
out:
+ if (queue_task_work)
+ schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
if (cs->nr_migrate_dl_tasks) {
@@ -3223,13 +3302,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
struct cpuset *trialcs;
int retval = -ENODEV;
+ /* root is read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
buf = strstrip(buf);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
goto out_unlock;
@@ -3254,9 +3336,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
if (force_sd_rebuild)
rebuild_sched_domains_locked();
out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
- flush_workqueue(cpuset_migrate_mm_wq);
+ cpuset_full_unlock();
+ if (of_cft(of)->private == FILE_MEMLIST)
+ schedule_flush_migrate_mm();
return retval ?: nbytes;
}
@@ -3358,12 +3440,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
else
return -EINVAL;
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval ?: nbytes;
}
@@ -3462,15 +3542,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
if (!parent_css)
return &top_cpuset.css;
- cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ cs = dup_or_alloc_cpuset(NULL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (alloc_cpumasks(cs, NULL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
-
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
@@ -3493,10 +3568,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
- set_bit(CS_ONLINE, &cs->flags);
+ cpuset_full_lock();
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
@@ -3548,8 +3620,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
return 0;
}
@@ -3564,17 +3635,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
+ cpuset_full_lock();
if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
- clear_bit(CS_ONLINE, &cs->flags);
-
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
}
/*
@@ -3586,16 +3652,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
+ cpuset_full_lock();
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
-
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
-
+ cpuset_full_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3724,7 +3785,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.can_fork = cpuset_can_fork,
.cancel_fork = cpuset_cancel_fork,
@@ -3928,7 +3988,7 @@ static void cpuset_handle_hotplug(void)
bool on_dfl = is_in_v2_mode();
struct tmpmasks tmp, *ptmp = NULL;
- if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ if (on_dfl && !alloc_tmpmasks(&tmp))
ptmp = &tmp;
lockdep_assert_cpus_held();
@@ -4008,7 +4068,7 @@ static void cpuset_handle_hotplug(void)
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
- free_cpumasks(NULL, ptmp);
+ free_tmpmasks(ptmp);
}
void cpuset_update_active_cpus(void)
@@ -4073,7 +4133,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = task_cs(tsk);
if (cs != &top_cpuset)
@@ -4095,7 +4154,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
cpumask_copy(pmask, possible_mask);
}
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -4168,9 +4226,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return mask;
@@ -4265,10 +4321,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 80aa3f027ac3..81ea38dd6f9d 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
return -ENODEV;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
@@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
css, css->id);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
cgroup_kn_unlock(of->kn);
return 0;
@@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
return -ENOMEM;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name_buf);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
kfree(name_buf);
return 0;
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index bf1690a167dd..6c18854bff34 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze)
/*
* Freeze or unfreeze all tasks in the given cgroup.
*/
-static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec)
{
struct css_task_iter it;
struct task_struct *task;
@@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- if (freeze)
+ write_seqcount_begin(&cgrp->freezer.freeze_seq);
+ if (freeze) {
set_bit(CGRP_FREEZE, &cgrp->flags);
- else
+ cgrp->freezer.freeze_start_nsec = ts_nsec;
+ } else {
clear_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.frozen_nsec += (ts_nsec -
+ cgrp->freezer.freeze_start_nsec);
+ }
+ write_seqcount_end(&cgrp->freezer.freeze_seq);
spin_unlock_irq(&css_set_lock);
if (freeze)
@@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
struct cgroup *parent;
struct cgroup *dsct;
bool applied = false;
+ u64 ts_nsec;
bool old_e;
lockdep_assert_held(&cgroup_mutex);
@@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
return;
cgrp->freezer.freeze = freeze;
+ ts_nsec = ktime_get_ns();
/*
* Propagate changes downwards the cgroup tree.
@@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
/*
* Do change actual state: freeze or unfreeze.
*/
- cgroup_do_freeze(dsct, freeze);
+ cgroup_do_freeze(dsct, freeze, ts_nsec);
applied = true;
}
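
The seqcount added around the freeze bookkeeping above lets a lockless reader combine CGRP_FREEZE, freeze_start_nsec and frozen_nsec into a consistent cumulative frozen time. A hedged sketch of such a read side (assumed for illustration; no reader is added in this hunk):

/* Sketch of a read side for the bookkeeping above; assumed, not from this diff. */
static u64 cgroup_read_frozen_nsec(struct cgroup *cgrp)
{
        unsigned int seq;
        u64 total;

        do {
                seq = read_seqcount_begin(&cgrp->freezer.freeze_seq);
                total = cgrp->freezer.frozen_nsec;
                /* If currently frozen, add the still-running interval. */
                if (test_bit(CGRP_FREEZE, &cgrp->flags))
                        total += ktime_get_ns() -
                                 cgrp->freezer.freeze_start_nsec;
        } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, seq));

        return total;
}
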
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6c83ad674d01..808c0d7a31fa 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
}
struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
int rctx, start_entry_idx;
+ /* crosstask is not supported for user stacks */
+ if (crosstask && user && !kernel)
+ return NULL;
+
entry = get_callchain_entry(&rctx);
if (!entry)
return NULL;
- ctx.entry = entry;
- ctx.max_stack = max_stack;
- ctx.nr = entry->nr = init_nr;
- ctx.contexts = 0;
- ctx.contexts_maxed = false;
+ ctx.entry = entry;
+ ctx.max_stack = max_stack;
+ ctx.nr = entry->nr = 0;
+ ctx.contexts = 0;
+ ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
@@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
perf_callchain_kernel(&ctx, regs);
}
- if (user) {
+ if (user && !crosstask) {
if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
-
- if (regs) {
- if (crosstask)
+ if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
goto exit_put;
+ regs = task_pt_regs(current);
+ }
- if (add_mark)
- perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+ if (add_mark)
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- start_entry_idx = entry->nr;
- perf_callchain_user(&ctx, regs);
- fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
- }
+ start_entry_idx = entry->nr;
+ perf_callchain_user(&ctx, regs);
+ fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 820127536e62..fb1eae762044 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3974,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
*/
static inline bool event_update_userpage(struct perf_event *event)
{
- if (likely(!atomic_read(&event->mmap_count)))
+ if (likely(!refcount_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
@@ -6710,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
mapped_f mapped = get_mapped(event, event_mapped);
- atomic_inc(&event->mmap_count);
- atomic_inc(&event->rb->mmap_count);
+ refcount_inc(&event->mmap_count);
+ refcount_inc(&event->rb->mmap_count);
if (vma->vm_pgoff)
- atomic_inc(&event->rb->aux_mmap_count);
+ refcount_inc(&event->rb->aux_mmap_count);
if (mapped)
mapped(event, vma->vm_mm);
@@ -6749,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
* to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
+ refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@@ -6769,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&rb->aux_mutex);
}
- if (atomic_dec_and_test(&rb->mmap_count))
+ if (refcount_dec_and_test(&rb->mmap_count))
detach_rest = true;
- if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
ring_buffer_attach(event, NULL);
@@ -6933,230 +6933,242 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
return err;
}
-static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
{
- struct perf_event *event = file->private_data;
- unsigned long user_locked, user_lock_limit;
+ unsigned long user_locked, user_lock_limit, locked, lock_limit;
struct user_struct *user = current_user();
- struct mutex *aux_mutex = NULL;
- struct perf_buffer *rb = NULL;
- unsigned long locked, lock_limit;
- unsigned long vma_size;
- unsigned long nr_pages;
- long user_extra = 0, extra = 0;
- int ret, flags = 0;
- mapped_f mapped;
+
+ user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ /* Increase the limit linearly with more CPUs */
+ user_lock_limit *= num_online_cpus();
+
+ user_locked = atomic_long_read(&user->locked_vm);
/*
- * Don't allow mmap() of inherited per-task counters. This would
- * create a performance issue due to all children writing to the
- * same rb.
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
*/
- if (event->cpu == -1 && event->attr.inherit)
- return -EINVAL;
+ if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += *user_extra;
- if (!(vma->vm_flags & VM_SHARED))
- return -EINVAL;
+ if (user_locked > user_lock_limit) {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
+ *extra = user_locked - user_lock_limit;
+ *user_extra -= *extra;
+ }
- ret = security_perf_event_read(event);
- if (ret)
- return ret;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
- vma_size = vma->vm_end - vma->vm_start;
- nr_pages = vma_size / PAGE_SIZE;
+ return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
+}
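
perf_mmap_calc_limits() centralises the existing accounting rule: the request is charged against user->locked_vm until the per-user mlock limit is hit, and any remainder spills into the mm's pinned_vm. A small userspace sketch of just that split, with the counters reduced to plain integers (illustrative only):

#include <stdio.h>

/*
 * Toy version of the perf_mmap_calc_limits() charge split: charge
 * user_extra pages against locked_vm up to user_lock_limit, and spill
 * the remainder (extra) into pinned_vm.
 */
static void split_charge(long user_locked, long user_lock_limit,
                         long *user_extra, long *extra)
{
        /* the sysctl may have shrunk the limit below what is already locked */
        if (user_locked > user_lock_limit)
                user_locked = user_lock_limit;
        user_locked += *user_extra;

        *extra = 0;
        if (user_locked > user_lock_limit) {
                *extra = user_locked - user_lock_limit;
                *user_extra -= *extra;
        }
}

int main(void)
{
        long user_extra = 64, extra;    /* request: 64 pages */

        /* 100 pages already locked, per-user limit of 128 pages */
        split_charge(100, 128, &user_extra, &extra);
        printf("locked_vm += %ld, pinned_vm += %ld\n", user_extra, extra);
        /* prints: locked_vm += 28, pinned_vm += 36 */
        return 0;
}
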
- if (nr_pages > INT_MAX)
- return -ENOMEM;
+static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
+{
+ struct user_struct *user = current_user();
- if (vma_size != PAGE_SIZE * nr_pages)
- return -EINVAL;
+ atomic_long_add(user_extra, &user->locked_vm);
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
+}
- user_extra = nr_pages;
+static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ struct perf_buffer *rb;
+ int rb_flags = 0;
- mutex_lock(&event->mmap_mutex);
- ret = -EINVAL;
+ nr_pages -= 1;
/*
- * This relies on __pmu_detach_event() taking mmap_mutex after marking
- * the event REVOKED. Either we observe the state, or __pmu_detach_event()
- * will detach the rb created here.
+ * If we have rb pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
*/
- if (event->state <= PERF_EVENT_STATE_REVOKED) {
- ret = -ENODEV;
- goto unlock;
- }
-
- if (vma->vm_pgoff == 0) {
- nr_pages -= 1;
-
- /*
- * If we have rb pages ensure they're a power-of-two number, so we
- * can do bitmasks instead of modulo.
- */
- if (nr_pages != 0 && !is_power_of_2(nr_pages))
- goto unlock;
-
- WARN_ON_ONCE(event->ctx->parent_ctx);
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ return -EINVAL;
- if (event->rb) {
- if (data_page_nr(event->rb) != nr_pages)
- goto unlock;
+ WARN_ON_ONCE(event->ctx->parent_ctx);
- if (atomic_inc_not_zero(&event->rb->mmap_count)) {
- /*
- * Success -- managed to mmap() the same buffer
- * multiple times.
- */
- ret = 0;
- /* We need the rb to map pages. */
- rb = event->rb;
- goto unlock;
- }
+ if (event->rb) {
+ if (data_page_nr(event->rb) != nr_pages)
+ return -EINVAL;
+ if (refcount_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close()'s
- * atomic_dec_and_mutex_lock() remove the
- * event and continue as if !event->rb
+ * Success -- managed to mmap() the same buffer
+ * multiple times.
*/
- ring_buffer_attach(event, NULL);
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
+ return 0;
}
- } else {
/*
- * AUX area mapping: if rb->aux_nr_pages != 0, it's already
- * mapped, all subsequent mappings should have the same size
- * and offset. Must be above the normal perf buffer.
+ * Raced against perf_mmap_close()'s
+ * refcount_dec_and_mutex_lock(); detach the
+ * rb and continue as if !event->rb
*/
- u64 aux_offset, aux_size;
+ ring_buffer_attach(event, NULL);
+ }
- rb = event->rb;
- if (!rb)
- goto aux_unlock;
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
+ return -EPERM;
- aux_mutex = &rb->aux_mutex;
- mutex_lock(aux_mutex);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- aux_offset = READ_ONCE(rb->user_page->aux_offset);
- aux_size = READ_ONCE(rb->user_page->aux_size);
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, rb_flags);
- if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
- goto aux_unlock;
+ if (!rb)
+ return -ENOMEM;
- if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
- goto aux_unlock;
+ refcount_set(&rb->mmap_count, 1);
+ rb->mmap_user = get_current_user();
+ rb->mmap_locked = extra;
- /* already mapped with a different offset */
- if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
- goto aux_unlock;
+ ring_buffer_attach(event, rb);
- if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
- goto aux_unlock;
+ perf_event_update_time(event);
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
- /* already mapped with a different size */
- if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
- goto aux_unlock;
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_set(&event->mmap_count, 1);
- if (!is_power_of_2(nr_pages))
- goto aux_unlock;
+ return 0;
+}
- if (!atomic_inc_not_zero(&rb->mmap_count))
- goto aux_unlock;
+static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ u64 aux_offset, aux_size;
+ struct perf_buffer *rb;
+ int ret, rb_flags = 0;
- if (rb_has_aux(rb)) {
- atomic_inc(&rb->aux_mmap_count);
- ret = 0;
- goto unlock;
- }
- }
+ rb = event->rb;
+ if (!rb)
+ return -EINVAL;
- user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ guard(mutex)(&rb->aux_mutex);
/*
- * Increase the limit linearly with more CPUs:
+ * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+ * mapped, all subsequent mappings should have the same size
+ * and offset. Must be above the normal perf buffer.
*/
- user_lock_limit *= num_online_cpus();
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
- user_locked = atomic_long_read(&user->locked_vm);
+ if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+ return -EINVAL;
- /*
- * sysctl_perf_event_mlock may have changed, so that
- * user->locked_vm > user_lock_limit
- */
- if (user_locked > user_lock_limit)
- user_locked = user_lock_limit;
- user_locked += user_extra;
+ if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+ return -EINVAL;
- if (user_locked > user_lock_limit) {
- /*
- * charge locked_vm until it hits user_lock_limit;
- * charge the rest from pinned_vm
- */
- extra = user_locked - user_lock_limit;
- user_extra -= extra;
- }
+ /* already mapped with a different offset */
+ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+ return -EINVAL;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
+ if (aux_size != nr_pages * PAGE_SIZE)
+ return -EINVAL;
- if ((locked > lock_limit) && perf_is_paranoid() &&
- !capable(CAP_IPC_LOCK)) {
- ret = -EPERM;
- goto unlock;
- }
+ /* already mapped with a different size */
+ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+ return -EINVAL;
- WARN_ON(!rb && event->rb);
+ if (!is_power_of_2(nr_pages))
+ return -EINVAL;
- if (vma->vm_flags & VM_WRITE)
- flags |= RING_BUFFER_WRITABLE;
+ if (!refcount_inc_not_zero(&rb->mmap_count))
+ return -EINVAL;
- if (!rb) {
- rb = rb_alloc(nr_pages,
- event->attr.watermark ? event->attr.wakeup_watermark : 0,
- event->cpu, flags);
+ if (rb_has_aux(rb)) {
+ refcount_inc(&rb->aux_mmap_count);
- if (!rb) {
- ret = -ENOMEM;
- goto unlock;
+ } else {
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
+ refcount_dec(&rb->mmap_count);
+ return -EPERM;
}
- atomic_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
- rb->mmap_locked = extra;
+ WARN_ON(!rb && event->rb);
- ring_buffer_attach(event, rb);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- perf_event_update_time(event);
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
- ret = 0;
- } else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
- event->attr.aux_watermark, flags);
- if (!ret) {
- atomic_set(&rb->aux_mmap_count, 1);
- rb->aux_mmap_locked = extra;
+ event->attr.aux_watermark, rb_flags);
+ if (ret) {
+ refcount_dec(&rb->mmap_count);
+ return ret;
}
+
+ refcount_set(&rb->aux_mmap_count, 1);
+ rb->aux_mmap_locked = extra;
}
-unlock:
- if (!ret) {
- atomic_long_add(user_extra, &user->locked_vm);
- atomic64_add(extra, &vma->vm_mm->pinned_vm);
-
- atomic_inc(&event->mmap_count);
- } else if (rb) {
- /* AUX allocation failed */
- atomic_dec(&rb->mmap_count);
- }
-aux_unlock:
- if (aux_mutex)
- mutex_unlock(aux_mutex);
- mutex_unlock(&event->mmap_mutex);
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
+ return 0;
+}
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_event *event = file->private_data;
+ unsigned long vma_size, nr_pages;
+ mapped_f mapped;
+ int ret;
+
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same rb.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ ret = security_perf_event_read(event);
if (ret)
return ret;
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = vma_size / PAGE_SIZE;
+
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
+
+ if (vma_size != PAGE_SIZE * nr_pages)
+ return -EINVAL;
+
+ scoped_guard (mutex, &event->mmap_mutex) {
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
+ if (vma->vm_pgoff == 0)
+ ret = perf_mmap_rb(vma, event, nr_pages);
+ else
+ ret = perf_mmap_aux(vma, event, nr_pages);
+ if (ret)
+ return ret;
+ }
+
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7174,7 +7186,7 @@ aux_unlock:
* full cleanup in this case and therefore does not invoke
* vmops::close().
*/
- ret = map_range(rb, vma);
+ ret = map_range(event->rb, vma);
if (ret)
perf_mmap_close(vma);
@@ -7440,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (!(current->flags & PF_KTHREAD)) {
+ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8080,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
- if (current->mm != NULL) {
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
struct page *p;
pagefault_disable();
@@ -8192,7 +8204,8 @@ struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
- bool user = !event->attr.exclude_callchain_user;
+ bool user = !event->attr.exclude_callchain_user &&
+ !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
@@ -8204,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, 0, kernel, user,
+ callchain = get_perf_callchain(regs, kernel, user,
max_stack, crosstask, true);
return callchain ?: &__empty_callchain;
}
@@ -13249,7 +13262,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
/* Can't redirect output if we've got an active mmap() */
- if (atomic_read(&event->mmap_count))
+ if (refcount_read(&event->mmap_count))
goto unlock;
if (output_event) {
@@ -13262,7 +13275,7 @@ set:
goto unlock;
/* did we race against perf_mmap_close() */
- if (!atomic_read(&rb->mmap_count)) {
+ if (!refcount_read(&rb->mmap_count)) {
ring_buffer_put(rb);
goto unlock;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 249288d82b8d..d9cc57083091 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -35,7 +35,7 @@ struct perf_buffer {
spinlock_t event_lock;
struct list_head event_list;
- atomic_t mmap_count;
+ refcount_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
@@ -47,7 +47,7 @@ struct perf_buffer {
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
- atomic_t aux_mmap_count;
+ refcount_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index aa9a759e824f..20a905023736 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* the same order, see perf_mmap_close. Otherwise we end up freeing
* aux pages in this path, which is a bug, because in_atomic().
*/
- if (!atomic_read(&rb->aux_mmap_count))
+ if (!refcount_read(&rb->aux_mmap_count))
goto err;
if (!refcount_inc_not_zero(&rb->aux_refcount))
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6d3034ec418d..c8bfff023c6e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}
-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}
-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
+ int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
- copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
- if (is_swbp_insn(new_opcode)) {
+ if (is_swbp_insn(insn)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
@@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
return identical;
}
-static int __uprobe_write_opcode(struct vm_area_struct *vma,
+static int __uprobe_write(struct vm_area_struct *vma,
struct folio_walk *fw, struct folio *folio,
- unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+ unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ bool is_register)
{
- const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
- const bool is_register = !!is_swbp_insn(&opcode);
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
bool pmd_mappable;
/* For now, we'll only handle PTE-mapped folios. */
@@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
*/
flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
- copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_to_page(fw->page, insn_vaddr, insn, nbytes);
/*
* When unregistering, we may only zap a PTE if uffd is disabled and
@@ -482,23 +483,32 @@ remap:
* @opcode_vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @opcode_vaddr.
*
- * Called with mm->mmap_lock held for read or write.
+ * Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
- const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
+ bool is_register)
{
- const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
+ verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
+}
+
+int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+ void *data)
+{
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
- int ret, is_register, ref_ctr_updated = 0;
+ int ret, ref_ctr_updated = 0;
unsigned int gup_flags = FOLL_FORCE;
struct mmu_notifier_range range;
struct folio_walk fw;
struct folio *folio;
struct page *page;
- is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
@@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* page that we can safely modify. Use FOLL_WRITE to trigger a write
* fault if required. When unregistering, we might be lucky and the
* anon page is already gone. So defer write faults until really
- * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write()
* cannot deal with PMDs yet.
*/
if (is_register)
@@ -521,14 +531,14 @@ retry:
goto out;
folio = page_folio(page);
- ret = verify_opcode(page, opcode_vaddr, &opcode);
+ ret = verify(page, insn_vaddr, insn, nbytes, data);
if (ret <= 0) {
folio_put(folio);
goto out;
}
/* We are going to replace instruction, update ref_ctr. */
- if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
+ if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
if (ret) {
folio_put(folio);
@@ -560,7 +570,7 @@ retry:
/* Walk the page tables again, to perform the actual update. */
if (folio_walk_start(&fw, vma, vaddr, 0)) {
if (fw.page == page)
- ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
+ ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
folio_walk_end(&fw, vma);
}
@@ -580,7 +590,7 @@ retry:
out:
/* Revert back reference counter if instruction update failed. */
- if (ret < 0 && ref_ctr_updated)
+ if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
@@ -602,7 +612,7 @@ out:
int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
}
/**
@@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe,
struct vm_area_struct *vma, unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
- *(uprobe_opcode_t *)&auprobe->insn);
+ *(uprobe_opcode_t *)&auprobe->insn, false);
}
/* uprobe should have guaranteed positive refcount */
@@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
if (IS_ERR(page))
return PTR_ERR(page);
- copy_from_page(page, offset, insn, nbytes);
+ uprobe_copy_from_page(page, offset, insn, nbytes);
put_page(page);
return 0;
@@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
- GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ GFP_NOWAIT | __GFP_NOMEMALLOC);
if (prev)
prev->next = NULL;
}
@@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode,
return ERR_PTR(-EINVAL);
/*
- * This ensures that copy_from_page(), copy_to_page() and
+ * This ensures that uprobe_copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
@@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
struct vm_area_struct *vma;
int err = 0;
- mmap_read_lock(mm);
+ mmap_write_lock(mm);
for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, vma, vaddr);
}
- mmap_read_unlock(mm);
+ mmap_write_unlock(mm);
return err;
}
@@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}
-void * __weak arch_uprobe_trampoline(unsigned long *psize)
+void * __weak arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
@@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- insns = arch_uprobe_trampoline(&insns_size);
+ insns = arch_uretprobe_trampoline(&insns_size);
arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
@@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void)
return area;
}
+void __weak arch_uprobe_clear_state(struct mm_struct *mm)
+{
+}
+
+void __weak arch_uprobe_init_state(struct mm_struct *mm)
+{
+}
+
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm)
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);
+ arch_uprobe_clear_state(mm);
+
if (!area)
return;
@@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
- copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
@@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2741,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
@@ -2752,6 +2779,23 @@ out:
rcu_read_unlock_trace();
}
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ guard(rcu_tasks_trace)();
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+ if (!uprobe)
+ return;
+ if (!get_utask())
+ return;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ return;
+ handler_chain(uprobe, regs);
+}
+
/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.
diff --git a/kernel/fork.c b/kernel/fork.c
index f24f4c71d002..cffa6157a55a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1014,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
+ arch_uprobe_init_state(mm);
#endif
}
@@ -1688,6 +1689,10 @@ static int copy_signal(u64 clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->cgroup_threadgroup_rwsem);
+#endif
+
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 4b6da9116aa6..880c9bf2f315 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -39,6 +39,56 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
return 0;
}
+static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
+{
+#ifdef CONFIG_COMPAT
+ if (compat)
+ return p->compat_robust_list;
+#endif
+ return p->robust_list;
+}
+
+static void __user *futex_get_robust_list_common(int pid, bool compat)
+{
+ struct task_struct *p = current;
+ void __user *head;
+ int ret;
+
+ scoped_guard(rcu) {
+ if (pid) {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ return (void __user *)ERR_PTR(-ESRCH);
+ }
+ get_task_struct(p);
+ }
+
+ /*
+ * Hold exec_update_lock to serialize with concurrent exec()
+ * so ptrace_may_access() is checked against stable credentials
+ */
+ ret = down_read_killable(&p->signal->exec_update_lock);
+ if (ret)
+ goto err_put;
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = futex_task_robust_list(p, compat);
+
+ up_read(&p->signal->exec_update_lock);
+ put_task_struct(p);
+
+ return head;
+
+err_unlock:
+ up_read(&p->signal->exec_update_lock);
+err_put:
+ put_task_struct(p);
+ return (void __user *)ERR_PTR(ret);
+}
+
/**
* sys_get_robust_list() - Get the robust-futex list head of a task
* @pid: pid of the process [zero for current task]
@@ -49,36 +99,14 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
struct robust_list_head __user * __user *, head_ptr,
size_t __user *, len_ptr)
{
- struct robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
+ struct robust_list_head __user *head = futex_get_robust_list_common(pid, false);
- head = p->robust_list;
- rcu_read_unlock();
+ if (IS_ERR(head))
+ return PTR_ERR(head);
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -455,36 +483,14 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
compat_uptr_t __user *, head_ptr,
compat_size_t __user *, len_ptr)
{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
+ struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true);
- head = p->compat_robust_list;
- rcu_read_unlock();
+ if (IS_ERR(head))
+ return PTR_ERR(head);
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
}
#endif /* CONFIG_COMPAT */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 1da5e9d9da71..1b4254d19a73 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -6,10 +6,6 @@ menu "IRQ subsystem"
config MAY_HAVE_SPARSE_IRQ
bool
-# Legacy support, required for itanic
-config GENERIC_IRQ_LEGACY
- bool
-
# Enable the generic irq autoprobe mechanism
config GENERIC_IRQ_PROBE
bool
@@ -147,7 +143,9 @@ config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
config IRQ_KUNIT_TEST
bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
depends on KUNIT=y
+ depends on SPARSE_IRQ
default KUNIT_ALL_TESTS
+ select IRQ_DOMAIN
imply SMP
help
This option enables KUnit tests for the IRQ subsystem API. These are
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 0d0276378c70..3ffa0d80ddd1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1260,6 +1260,43 @@ int irq_chip_get_parent_state(struct irq_data *data,
EXPORT_SYMBOL_GPL(irq_chip_get_parent_state);
/**
+ * irq_chip_shutdown_parent - Shut down the parent interrupt
+ * @data: Pointer to interrupt specific data
+ *
+ * Invokes the irq_shutdown() callback of the parent if available or falls
+ * back to irq_chip_disable_parent().
+ */
+void irq_chip_shutdown_parent(struct irq_data *data)
+{
+ struct irq_data *parent = data->parent_data;
+
+ if (parent->chip->irq_shutdown)
+ parent->chip->irq_shutdown(parent);
+ else
+ irq_chip_disable_parent(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent);
+
+/**
+ * irq_chip_startup_parent - Start up the parent interrupt
+ * @data: Pointer to interrupt specific data
+ *
+ * Invokes the irq_startup() callback of the parent if available or falls
+ * back to irq_chip_enable_parent().
+ */
+unsigned int irq_chip_startup_parent(struct irq_data *data)
+{
+ struct irq_data *parent = data->parent_data;
+
+ if (parent->chip->irq_startup)
+ return parent->chip->irq_startup(parent);
+
+ irq_chip_enable_parent(data);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_chip_startup_parent);
+
+/**
* irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
* NULL)
* @data: Pointer to interrupt specific data
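The two new helpers target hierarchical irqchips whose parent provides irq_startup()/irq_shutdown(). As an illustration only (the my_child_* names are hypothetical, not part of this patch), a stacked child chip could wire them up like this and fall back transparently to enable/disable when the parent lacks the callbacks:

/* Hypothetical child chip in a hierarchical IRQ domain; assumes <linux/irq.h>. */
static unsigned int my_child_irq_startup(struct irq_data *d)
{
	/* child-specific setup would go here, then start the parent */
	return irq_chip_startup_parent(d);
}

static void my_child_irq_shutdown(struct irq_data *d)
{
	/* quiesce the child first, then shut down the parent */
	irq_chip_shutdown_parent(d);
}

static struct irq_chip my_child_irq_chip = {
	.name		= "my-child",
	.irq_mask	= irq_chip_mask_parent,
	.irq_unmask	= irq_chip_unmask_parent,
	.irq_startup	= my_child_irq_startup,
	.irq_shutdown	= my_child_irq_shutdown,
};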
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index eb16a58e0322..b41188698622 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -30,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
return this->irq == match->irq && this->dev_id == match->dev_id;
}
-/**
- * devm_request_threaded_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @thread_fn: function to be called in a threaded interrupt context. NULL
- * for devices which handle everything in @handler
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
- * @dev_id: A cookie passed back to the handler function
- *
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_threaded_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
- *
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
- */
-int devm_request_threaded_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, irq_handler_t thread_fn,
- unsigned long irqflags, const char *devname,
- void *dev_id)
+static int devm_request_result(struct device *dev, int rc, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ const char *devname)
+{
+ if (rc >= 0)
+ return rc;
+
+ return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n",
+ irq, handler, thread_fn, devname ? : "");
+}
+
+static int __devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ irq_handler_t thread_fn,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -78,28 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
return 0;
}
-EXPORT_SYMBOL(devm_request_threaded_irq);
/**
- * devm_request_any_context_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
- * @dev_id: A cookie passed back to the handler function
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @thread_fn: Function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
*
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_any_context_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_threaded_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
*
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
+ * Return: 0 on success or a negative error number.
*/
-int devm_request_any_context_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, unsigned long irqflags,
- const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ unsigned long irqflags, const char *devname,
+ void *dev_id)
+{
+ int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn,
+ irqflags, devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, thread_fn, devname);
+}
+EXPORT_SYMBOL(devm_request_threaded_irq);
+
+static int __devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -124,6 +137,40 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
return rc;
}
+
+/**
+ * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_any_context_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
+ *
+ * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error
+ * number.
+ */
+int devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags,
+ devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, NULL, devname);
+}
EXPORT_SYMBOL(devm_request_any_context_irq);
/**
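With failures now reported via dev_err_probe() inside the helper, the kernel-doc above asks call sites not to duplicate the message. A hypothetical probe path (the foo_* names are illustrative, not from this patch) then reduces to:

/* Illustration only: a probe relying on the built-in failure logging. */
static irqreturn_t foo_hardirq(int irq, void *dev_id)
{
	return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
	/* bottom-half work happens here */
	return IRQ_HANDLED;
}

static int foo_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 0);

	if (irq < 0)
		return irq;

	/* No dev_err() on failure: devm_request_threaded_irq() already logs it. */
	return devm_request_threaded_irq(&pdev->dev, irq, foo_hardirq,
					 foo_thread_fn, IRQF_ONESHOT,
					 NULL, pdev);
}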
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9489f93b3db3..e103451243a0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -136,6 +136,44 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
+static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled);
+static u64 irqhandler_duration_threshold_ns __ro_after_init;
+
+static int __init irqhandler_duration_check_setup(char *arg)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(arg, 0, &val);
+ if (ret) {
+ pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret);
+ return 0;
+ }
+
+ if (!val) {
+ pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n");
+ return 0;
+ }
+
+ irqhandler_duration_threshold_ns = val * 1000;
+ static_branch_enable(&irqhandler_duration_check_enabled);
+
+ return 1;
+}
+__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup);
+
+static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq,
+ const struct irqaction *action)
+{
+ u64 delta_ns = local_clock() - ts_start;
+
+ if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) {
+ pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n",
+ smp_processor_id(), irq, action->handler,
+ div_u64(delta_ns, NSEC_PER_USEC));
+ }
+}
+
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
@@ -155,7 +193,16 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
lockdep_hardirq_threaded();
trace_irq_handler_entry(irq, action);
- res = action->handler(irq, action->dev_id);
+
+ if (static_branch_unlikely(&irqhandler_duration_check_enabled)) {
+ u64 ts_start = local_clock();
+
+ res = action->handler(irq, action->dev_id);
+ irqhandler_duration_check(ts_start, irq, action);
+ } else {
+ res = action->handler(irq, action->dev_id);
+ }
+
trace_irq_handler_exit(irq, action, res);
if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
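Usage note: the measurement sits behind a static key that stays disabled unless the new boot parameter is given. Booting with, for example, irqhandler.duration_warn_us=500 converts the value to nanoseconds, enables the branch, and brackets every hard-interrupt handler with local_clock() reads; any handler exceeding 500 us then triggers the rate-limited warning shown above. Without the parameter the only overhead is the patched-out static branch.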
diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c
index a75abebed7f2..e2d31914b3c4 100644
--- a/kernel/irq/irq_test.c
+++ b/kernel/irq/irq_test.c
@@ -41,21 +41,37 @@ static struct irq_chip fake_irq_chip = {
.flags = IRQCHIP_SKIP_SET_WAKE,
};
-static void irq_disable_depth_test(struct kunit *test)
+static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd)
{
struct irq_desc *desc;
- int virq, ret;
+ int virq;
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd);
KUNIT_ASSERT_GE(test, virq, 0);
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */
+ irq_settings_clr_norequest(desc);
+
+ return virq;
+}
+
+static void irq_disable_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_test_setup_fake_irq(test, NULL);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
@@ -73,16 +89,13 @@ static void irq_free_disabled_test(struct kunit *test)
struct irq_desc *desc;
int virq, ret;
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, NULL);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
@@ -93,7 +106,7 @@ static void irq_free_disabled_test(struct kunit *test)
KUNIT_EXPECT_GE(test, desc->depth, 1);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
free_irq(virq, NULL);
@@ -112,10 +125,7 @@ static void irq_shutdown_depth_test(struct kunit *test)
if (!IS_ENABLED(CONFIG_SMP))
kunit_skip(test, "requires CONFIG_SMP for managed shutdown");
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, &affinity);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
@@ -124,7 +134,7 @@ static void irq_shutdown_depth_test(struct kunit *test)
KUNIT_ASSERT_PTR_NE(test, data, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
@@ -169,13 +179,12 @@ static void irq_cpuhotplug_test(struct kunit *test)
kunit_skip(test, "requires more than 1 CPU for CPU hotplug");
if (!cpu_is_hotpluggable(1))
kunit_skip(test, "CPU 1 must be hotpluggable");
+ if (!cpu_online(1))
+ kunit_skip(test, "CPU 1 must be online");
cpumask_copy(&affinity.mask, cpumask_of(1));
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, &affinity);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
@@ -184,7 +193,7 @@ static void irq_cpuhotplug_test(struct kunit *test)
KUNIT_ASSERT_PTR_NE(test, data, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
@@ -196,13 +205,9 @@ static void irq_cpuhotplug_test(struct kunit *test)
KUNIT_EXPECT_EQ(test, desc->depth, 1);
KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
- KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
- KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
KUNIT_EXPECT_GE(test, desc->depth, 1);
KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
- KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
- KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
KUNIT_EXPECT_EQ(test, desc->depth, 1);
enable_irq(virq);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index b64c57b44c20..db714d3014b5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -653,13 +653,6 @@ void irq_mark_irq(unsigned int irq)
irq_insert_desc(irq, irq_desc + irq);
}
-#ifdef CONFIG_GENERIC_IRQ_LEGACY
-void irq_init_desc(unsigned int irq)
-{
- free_desc(irq);
-}
-#endif
-
#endif /* !CONFIG_SPARSE_IRQ */
int handle_irq_desc(struct irq_desc *desc)
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e81..2452b7366b00 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
/*
* Load and clear event mask atomically with respect to
- * scheduler preemption.
+ * scheduler preemption and membarrier IPIs.
*/
- preempt_disable();
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- preempt_enable();
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ }
return !!event_mask;
}
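For reference, scoped_guard() (linux/cleanup.h) holds the named guard class for exactly the braced region and releases it at the closing brace, so the load-and-clear of rseq_event_mask remains atomic with respect to whatever RSEQ_EVENT_GUARD disables. The guard class itself is defined outside this hunk; per the updated comment it is expected to exclude membarrier IPIs as well as preemption, which suggests it may disable interrupts rather than only preemption.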
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index c4a488e67aa7..755883faf751 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -58,6 +58,7 @@
#include "deadline.c"
#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext_internal.h"
# include "ext.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec33b0353027..198d2dd45f59 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,6 +7,8 @@
* Copyright (C) 1991-2002 Linus Torvalds
* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
+#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE
+#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
@@ -917,7 +919,7 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
+ rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta);
if (rq == this_rq())
__hrtick_restart(rq);
@@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
__do_set_cpus_allowed(p, &ac);
}
-void migrate_disable(void)
-{
- struct task_struct *p = current;
-
- if (p->migration_disabled) {
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- *Warn about overflow half-way through the range.
- */
- WARN_ON_ONCE((s16)p->migration_disabled < 0);
-#endif
- p->migration_disabled++;
- return;
- }
-
- guard(preempt)();
- this_rq()->nr_pinned++;
- p->migration_disabled = 1;
-}
-EXPORT_SYMBOL_GPL(migrate_disable);
-
-void migrate_enable(void)
+void ___migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
@@ -2410,35 +2391,19 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Check both overflow from migrate_disable() and superfluous
- * migrate_enable().
- */
- if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
- return;
-#endif
+ __set_cpus_allowed_ptr(p, &ac);
+}
+EXPORT_SYMBOL_GPL(___migrate_enable);
- if (p->migration_disabled > 1) {
- p->migration_disabled--;
- return;
- }
+void migrate_disable(void)
+{
+ __migrate_disable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
- /*
- * Ensure stop_task runs either before or after this, and that
- * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
- */
- guard(preempt)();
- if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &ac);
- /*
- * Mustn't clear migration_disabled() until cpus_ptr points back at the
- * regular cpus_mask, otherwise things that race (eg.
- * select_fallback_rq) get confused.
- */
- barrier();
- p->migration_disabled = 0;
- this_rq()->nr_pinned--;
+void migrate_enable(void)
+{
+ __migrate_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -4490,6 +4455,9 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
+#ifdef CONFIG_CFS_BANDWIDTH
+ init_cfs_throttle_work(p);
+#endif
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -9362,8 +9330,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task, false);
-
- scx_cgroup_finish_attach();
}
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 72c1f72463c7..615411a0a881 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2551,6 +2551,25 @@ static int find_later_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -2578,12 +2597,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
+ /*
+ * double_lock_balance() had to release rq->lock; in the
+ * meantime the task may no longer be fit to be migrated.
+ * Check the following to ensure that the task is still
+ * suitable for migration:
+ * 1. The task may have been scheduled, set
+ *    migrate_disabled and then been preempted, so the
+ *    migration-disable flag must be rechecked.
+ * 2. The CPU picked must still be in the task's
+ *    affinity mask.
+ * 3. For a throttled task (dl_task_offline_migration),
+ *    verify that:
+ *    - the task is still on this rq (it has not been
+ *      migrated elsewhere)
+ *    - the task is not currently running on a CPU
+ *    - the task is still a deadline task
+ *    - the task is still queued on the rq
+ * 4. For a non-throttled task (push_dl_task), checking
+ *    that it is still at the head of the pushable
+ *    tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !dl_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -2606,25 +2650,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
-
- WARN_ON_ONCE(rq->cpu != task_cpu(p));
- WARN_ON_ONCE(task_current(rq, p));
- WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
-
- WARN_ON_ONCE(!task_on_rq_queued(p));
- WARN_ON_ONCE(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 088ceff38c8a..2b0e88206d07 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,1040 +9,6 @@
#include <linux/btf_ids.h>
#include "ext_idle.h"
-#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
-
-enum scx_consts {
- SCX_DSP_DFL_MAX_BATCH = 32,
- SCX_DSP_MAX_LOOPS = 32,
- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
-
- SCX_EXIT_BT_LEN = 64,
- SCX_EXIT_MSG_LEN = 1024,
- SCX_EXIT_DUMP_DFL_LEN = 32768,
-
- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
-
- /*
- * Iterating all tasks may take a while. Periodically drop
- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
- */
- SCX_TASK_ITER_BATCH = 32,
-};
-
-enum scx_exit_kind {
- SCX_EXIT_NONE,
- SCX_EXIT_DONE,
-
- SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
- SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
- SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
- SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
-
- SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
- SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
- SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
-};
-
-/*
- * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
- * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
- * are 64bit of the format:
- *
- * Bits: [63 .. 48 47 .. 32 31 .. 0]
- * [ SYS ACT ] [ SYS RSN ] [ USR ]
- *
- * SYS ACT: System-defined exit actions
- * SYS RSN: System-defined exit reasons
- * USR : User-defined exit codes and reasons
- *
- * Using the above, users may communicate intention and context by ORing system
- * actions and/or system reasons with a user-defined exit code.
- */
-enum scx_exit_code {
- /* Reasons */
- SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
-
- /* Actions */
- SCX_ECODE_ACT_RESTART = 1LLU << 48,
-};
-
-/*
- * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
- * being disabled.
- */
-struct scx_exit_info {
- /* %SCX_EXIT_* - broad category of the exit reason */
- enum scx_exit_kind kind;
-
- /* exit code if gracefully exiting */
- s64 exit_code;
-
- /* textual representation of the above */
- const char *reason;
-
- /* backtrace if exiting due to an error */
- unsigned long *bt;
- u32 bt_len;
-
- /* informational message */
- char *msg;
-
- /* debug dump */
- char *dump;
-};
-
-/* sched_ext_ops.flags */
-enum scx_ops_flags {
- /*
- * Keep built-in idle tracking even if ops.update_idle() is implemented.
- */
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
-
- /*
- * By default, if there are no other task to run on the CPU, ext core
- * keeps running the current task even after its slice expires. If this
- * flag is specified, such tasks are passed to ops.enqueue() with
- * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
- */
- SCX_OPS_ENQ_LAST = 1LLU << 1,
-
- /*
- * An exiting task may schedule after PF_EXITING is set. In such cases,
- * bpf_task_from_pid() may not be able to find the task and if the BPF
- * scheduler depends on pid lookup for dispatching, the task will be
- * lost leading to various issues including RCU grace period stalls.
- *
- * To mask this problem, by default, unhashed tasks are automatically
- * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
- * depend on pid lookups and wants to handle these tasks directly, the
- * following flag can be used.
- */
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
-
- /*
- * If set, only tasks with policy set to SCHED_EXT are attached to
- * sched_ext. If clear, SCHED_NORMAL tasks are also included.
- */
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
-
- /*
- * A migration disabled task can only execute on its current CPU. By
- * default, such tasks are automatically put on the CPU's local DSQ with
- * the default slice on enqueue. If this ops flag is set, they also go
- * through ops.enqueue().
- *
- * A migration disabled task never invokes ops.select_cpu() as it can
- * only select the current CPU. Also, p->cpus_ptr will only contain its
- * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
- * and thus may disagree with cpumask_weight(p->cpus_ptr).
- */
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
-
- /*
- * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
- * ops.enqueue() on the ops.select_cpu() selected or the wakee's
- * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
- * transfers. When this optimization is enabled, ops.select_cpu() is
- * skipped in some cases (when racing against the wakee switching out).
- * As the BPF scheduler may depend on ops.select_cpu() being invoked
- * during wakeups, queued wakeup is disabled by default.
- *
- * If this ops flag is set, queued wakeup optimization is enabled and
- * the BPF scheduler must be able to handle ops.enqueue() invoked on the
- * wakee's CPU without preceding ops.select_cpu() even for tasks which
- * may be executed on multiple CPUs.
- */
- SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
-
- /*
- * If set, enable per-node idle cpumasks. If clear, use a single global
- * flat idle cpumask.
- */
- SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
-
- /*
- * CPU cgroup support flags
- */
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
-
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
-
- /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
- __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
-};
-
-/* argument container for ops.init_task() */
-struct scx_init_task_args {
- /*
- * Set if ops.init_task() is being invoked on the fork path, as opposed
- * to the scheduler transition path.
- */
- bool fork;
-#ifdef CONFIG_EXT_GROUP_SCHED
- /* the cgroup the task is joining */
- struct cgroup *cgroup;
-#endif
-};
-
-/* argument container for ops.exit_task() */
-struct scx_exit_task_args {
- /* Whether the task exited before running on sched_ext. */
- bool cancelled;
-};
-
-/* argument container for ops->cgroup_init() */
-struct scx_cgroup_init_args {
- /* the weight of the cgroup [1..10000] */
- u32 weight;
-
- /* bandwidth control parameters from cpu.max and cpu.max.burst */
- u64 bw_period_us;
- u64 bw_quota_us;
- u64 bw_burst_us;
-};
-
-enum scx_cpu_preempt_reason {
- /* next task is being scheduled by &sched_class_rt */
- SCX_CPU_PREEMPT_RT,
- /* next task is being scheduled by &sched_class_dl */
- SCX_CPU_PREEMPT_DL,
- /* next task is being scheduled by &sched_class_stop */
- SCX_CPU_PREEMPT_STOP,
- /* unknown reason for SCX being preempted */
- SCX_CPU_PREEMPT_UNKNOWN,
-};
-
-/*
- * Argument container for ops->cpu_acquire(). Currently empty, but may be
- * expanded in the future.
- */
-struct scx_cpu_acquire_args {};
-
-/* argument container for ops->cpu_release() */
-struct scx_cpu_release_args {
- /* the reason the CPU was preempted */
- enum scx_cpu_preempt_reason reason;
-
- /* the task that's going to be scheduled on the CPU */
- struct task_struct *task;
-};
-
-/*
- * Informational context provided to dump operations.
- */
-struct scx_dump_ctx {
- enum scx_exit_kind kind;
- s64 exit_code;
- const char *reason;
- u64 at_ns;
- u64 at_jiffies;
-};
-
-/**
- * struct sched_ext_ops - Operation table for BPF scheduler implementation
- *
- * A BPF scheduler can implement an arbitrary scheduling policy by
- * implementing and loading operations in this table. Note that a userland
- * scheduling policy can also be implemented using the BPF scheduler
- * as a shim layer.
- */
-struct sched_ext_ops {
- /**
- * @select_cpu: Pick the target CPU for a task which is being woken up
- * @p: task being woken up
- * @prev_cpu: the cpu @p was on before sleeping
- * @wake_flags: SCX_WAKE_*
- *
- * Decision made here isn't final. @p may be moved to any CPU while it
- * is getting dispatched for execution later. However, as @p is not on
- * the rq at this point, getting the eventual execution CPU right here
- * saves a small bit of overhead down the line.
- *
- * If an idle CPU is returned, the CPU is kicked and will try to
- * dispatch. While an explicit custom mechanism can be added,
- * select_cpu() serves as the default way to wake up idle CPUs.
- *
- * @p may be inserted into a DSQ directly by calling
- * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
- * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
- * of the CPU returned by this operation.
- *
- * Note that select_cpu() is never called for tasks that can only run
- * on a single CPU or tasks with migration disabled, as they don't have
- * the option to select a different CPU. See select_task_rq() for
- * details.
- */
- s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
-
- /**
- * @enqueue: Enqueue a task on the BPF scheduler
- * @p: task being enqueued
- * @enq_flags: %SCX_ENQ_*
- *
- * @p is ready to run. Insert directly into a DSQ by calling
- * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
- * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
- * the task will stall.
- *
- * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
- * skipped.
- */
- void (*enqueue)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @dequeue: Remove a task from the BPF scheduler
- * @p: task being dequeued
- * @deq_flags: %SCX_DEQ_*
- *
- * Remove @p from the BPF scheduler. This is usually called to isolate
- * the task while updating its scheduling properties (e.g. priority).
- *
- * The ext core keeps track of whether the BPF side owns a given task or
- * not and can gracefully ignore spurious dispatches from BPF side,
- * which makes it safe to not implement this method. However, depending
- * on the scheduling logic, this can lead to confusing behaviors - e.g.
- * scheduling position not being updated across a priority change.
- */
- void (*dequeue)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
- * @cpu: CPU to dispatch tasks for
- * @prev: previous task being switched out
- *
- * Called when a CPU's local dsq is empty. The operation should dispatch
- * one or more tasks from the BPF scheduler into the DSQs using
- * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
- * using scx_bpf_dsq_move_to_local().
- *
- * The maximum number of times scx_bpf_dsq_insert() can be called
- * without an intervening scx_bpf_dsq_move_to_local() is specified by
- * ops.dispatch_max_batch. See the comments on top of the two functions
- * for more details.
- *
- * When not %NULL, @prev is an SCX task with its slice depleted. If
- * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
- * @prev->scx.flags, it is not enqueued yet and will be enqueued after
- * ops.dispatch() returns. To keep executing @prev, return without
- * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
- */
- void (*dispatch)(s32 cpu, struct task_struct *prev);
-
- /**
- * @tick: Periodic tick
- * @p: task running currently
- *
- * This operation is called every 1/HZ seconds on CPUs which are
- * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
- * immediate dispatch cycle on the CPU.
- */
- void (*tick)(struct task_struct *p);
-
- /**
- * @runnable: A task is becoming runnable on its associated CPU
- * @p: task becoming runnable
- * @enq_flags: %SCX_ENQ_*
- *
- * This and the following three functions can be used to track a task's
- * execution state transitions. A task becomes ->runnable() on a CPU,
- * and then goes through one or more ->running() and ->stopping() pairs
- * as it runs on the CPU, and eventually becomes ->quiescent() when it's
- * done running on the CPU.
- *
- * @p is becoming runnable on the CPU because it's
- *
- * - waking up (%SCX_ENQ_WAKEUP)
- * - being moved from another CPU
- * - being restored after temporarily taken off the queue for an
- * attribute change.
- *
- * This and ->enqueue() are related but not coupled. This operation
- * notifies @p's state transition and may not be followed by ->enqueue()
- * e.g. when @p is being dispatched to a remote CPU, or when @p is
- * being enqueued on a CPU experiencing a hotplug event. Likewise, a
- * task may be ->enqueue()'d without being preceded by this operation
- * e.g. after exhausting its slice.
- */
- void (*runnable)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @running: A task is starting to run on its associated CPU
- * @p: task starting to run
- *
- * Note that this callback may be called from a CPU other than the
- * one the task is going to run on. This can happen when a task
- * property is changed (i.e., affinity), since scx_next_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to determine the
- * target CPU the task is going to use.
- *
- * See ->runnable() for explanation on the task state notifiers.
- */
- void (*running)(struct task_struct *p);
-
- /**
- * @stopping: A task is stopping execution
- * @p: task stopping to run
- * @runnable: is task @p still runnable?
- *
- * Note that this callback may be called from a CPU other than the
- * one the task was running on. This can happen when a task
- * property is changed (i.e., affinity), since dequeue_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
- * the task was running on.
- *
- * See ->runnable() for explanation on the task state notifiers. If
- * !@runnable, ->quiescent() will be invoked after this operation
- * returns.
- */
- void (*stopping)(struct task_struct *p, bool runnable);
-
- /**
- * @quiescent: A task is becoming not runnable on its associated CPU
- * @p: task becoming not runnable
- * @deq_flags: %SCX_DEQ_*
- *
- * See ->runnable() for explanation on the task state notifiers.
- *
- * @p is becoming quiescent on the CPU because it's
- *
- * - sleeping (%SCX_DEQ_SLEEP)
- * - being moved to another CPU
- * - being temporarily taken off the queue for an attribute change
- * (%SCX_DEQ_SAVE)
- *
- * This and ->dequeue() are related but not coupled. This operation
- * notifies @p's state transition and may not be preceded by ->dequeue()
- * e.g. when @p is being dispatched to a remote CPU.
- */
- void (*quiescent)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @yield: Yield CPU
- * @from: yielding task
- * @to: optional yield target task
- *
- * If @to is NULL, @from is yielding the CPU to other runnable tasks.
- * The BPF scheduler should ensure that other available tasks are
- * dispatched before the yielding task. Return value is ignored in this
- * case.
- *
- * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
- * scheduler can implement the request, return %true; otherwise, %false.
- */
- bool (*yield)(struct task_struct *from, struct task_struct *to);
-
- /**
- * @core_sched_before: Task ordering for core-sched
- * @a: task A
- * @b: task B
- *
- * Used by core-sched to determine the ordering between two tasks. See
- * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
- * core-sched.
- *
- * Both @a and @b are runnable and may or may not currently be queued on
- * the BPF scheduler. Should return %true if @a should run before @b.
- * %false if there's no required ordering or @b should run before @a.
- *
- * If not specified, the default is ordering them according to when they
- * became runnable.
- */
- bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
-
- /**
- * @set_weight: Set task weight
- * @p: task to set weight for
- * @weight: new weight [1..10000]
- *
- * Update @p's weight to @weight.
- */
- void (*set_weight)(struct task_struct *p, u32 weight);
-
- /**
- * @set_cpumask: Set CPU affinity
- * @p: task to set CPU affinity for
- * @cpumask: cpumask of cpus that @p can run on
- *
- * Update @p's CPU affinity to @cpumask.
- */
- void (*set_cpumask)(struct task_struct *p,
- const struct cpumask *cpumask);
-
- /**
- * @update_idle: Update the idle state of a CPU
- * @cpu: CPU to update the idle state for
- * @idle: whether entering or exiting the idle state
- *
- * This operation is called when @rq's CPU goes or leaves the idle
- * state. By default, implementing this operation disables the built-in
- * idle CPU tracking and the following helpers become unavailable:
- *
- * - scx_bpf_select_cpu_dfl()
- * - scx_bpf_select_cpu_and()
- * - scx_bpf_test_and_clear_cpu_idle()
- * - scx_bpf_pick_idle_cpu()
- *
- * The user also must implement ops.select_cpu() as the default
- * implementation relies on scx_bpf_select_cpu_dfl().
- *
- * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
- * tracking.
- */
- void (*update_idle)(s32 cpu, bool idle);
-
- /**
- * @cpu_acquire: A CPU is becoming available to the BPF scheduler
- * @cpu: The CPU being acquired by the BPF scheduler.
- * @args: Acquire arguments, see the struct definition.
- *
- * A CPU that was previously released from the BPF scheduler is now once
- * again under its control.
- */
- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-
- /**
- * @cpu_release: A CPU is taken away from the BPF scheduler
- * @cpu: The CPU being released by the BPF scheduler.
- * @args: Release arguments, see the struct definition.
- *
- * The specified CPU is no longer under the control of the BPF
- * scheduler. This could be because it was preempted by a higher
- * priority sched_class, though there may be other reasons as well. The
- * caller should consult @args->reason to determine the cause.
- */
- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-
- /**
- * @init_task: Initialize a task to run in a BPF scheduler
- * @p: task to initialize for BPF scheduling
- * @args: init arguments, see the struct definition
- *
- * Either we're loading a BPF scheduler or a new task is being forked.
- * Initialize @p for BPF scheduling. This operation may block and can
- * be used for allocations, and is called exactly once for a task.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During a fork, it
- * will abort that specific fork.
- */
- s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
-
- /**
- * @exit_task: Exit a previously-running task from the system
- * @p: task to exit
- * @args: exit arguments, see the struct definition
- *
- * @p is exiting or the BPF scheduler is being unloaded. Perform any
- * necessary cleanup for @p.
- */
- void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
-
- /**
- * @enable: Enable BPF scheduling for a task
- * @p: task to enable BPF scheduling for
- *
- * Enable @p for BPF scheduling. enable() is called on @p any time it
- * enters SCX, and is always paired with a matching disable().
- */
- void (*enable)(struct task_struct *p);
-
- /**
- * @disable: Disable BPF scheduling for a task
- * @p: task to disable BPF scheduling for
- *
- * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
- * Disable BPF scheduling for @p. A disable() call is always matched
- * with a prior enable() call.
- */
- void (*disable)(struct task_struct *p);
-
- /**
- * @dump: Dump BPF scheduler state on error
- * @ctx: debug dump context
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
- */
- void (*dump)(struct scx_dump_ctx *ctx);
-
- /**
- * @dump_cpu: Dump BPF scheduler state for a CPU on error
- * @ctx: debug dump context
- * @cpu: CPU to generate debug dump for
- * @idle: @cpu is currently idle without any runnable tasks
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @cpu. If @idle is %true and this operation doesn't produce any
- * output, @cpu is skipped for dump.
- */
- void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
-
- /**
- * @dump_task: Dump BPF scheduler state for a runnable task on error
- * @ctx: debug dump context
- * @p: runnable task to generate debug dump for
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @p.
- */
- void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
-
-#ifdef CONFIG_EXT_GROUP_SCHED
- /**
- * @cgroup_init: Initialize a cgroup
- * @cgrp: cgroup being initialized
- * @args: init arguments, see the struct definition
- *
- * Either the BPF scheduler is being loaded or @cgrp created, initialize
- * @cgrp for sched_ext. This operation may block.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During cgroup
- * creation, it will abort the specific cgroup creation.
- */
- s32 (*cgroup_init)(struct cgroup *cgrp,
- struct scx_cgroup_init_args *args);
-
- /**
- * @cgroup_exit: Exit a cgroup
- * @cgrp: cgroup being exited
- *
- * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
- * @cgrp for sched_ext. This operation may block.
- */
- void (*cgroup_exit)(struct cgroup *cgrp);
-
- /**
- * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Prepare @p for move from cgroup @from to @to. This operation may
- * block and can be used for allocations.
- *
- * Return 0 for success, -errno for failure. An error return aborts the
- * migration.
- */
- s32 (*cgroup_prep_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_move: Commit cgroup move
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Commit the move. @p is dequeued during this operation.
- */
- void (*cgroup_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_cancel_move: Cancel cgroup move
- * @p: task whose cgroup move is being canceled
- * @from: cgroup @p was being moved from
- * @to: cgroup @p was being moved to
- *
- * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
- * Undo the preparation.
- */
- void (*cgroup_cancel_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_set_weight: A cgroup's weight is being changed
- * @cgrp: cgroup whose weight is being updated
- * @weight: new weight [1..10000]
- *
- * Update @cgrp's weight to @weight.
- */
- void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-
- /**
- * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
- * @cgrp: cgroup whose bandwidth is being updated
- * @period_us: bandwidth control period
- * @quota_us: bandwidth control quota
- * @burst_us: bandwidth control burst
- *
- * Update @cgrp's bandwidth control parameters. This is from the cpu.max
- * cgroup interface.
- *
- * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
- * to. For example, if @period_us is 1_000_000 and @quota_us is
- * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
- * interpreted in the same fashion and specifies how much @cgrp can
- * burst temporarily. The specific control mechanism and thus the
- * interpretation of @period_us and burstiness is up to the BPF
- * scheduler.
- */
- void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
- u64 period_us, u64 quota_us, u64 burst_us);
-
-#endif /* CONFIG_EXT_GROUP_SCHED */
-
- /*
- * All online ops must come before ops.cpu_online().
- */
-
- /**
- * @cpu_online: A CPU became online
- * @cpu: CPU which just came up
- *
- * @cpu just came online. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
- */
- void (*cpu_online)(s32 cpu);
-
- /**
- * @cpu_offline: A CPU is going offline
- * @cpu: CPU which is going offline
- *
- * @cpu is going offline. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
- */
- void (*cpu_offline)(s32 cpu);
-
- /*
- * All CPU hotplug ops must come before ops.init().
- */
-
- /**
- * @init: Initialize the BPF scheduler
- */
- s32 (*init)(void);
-
- /**
- * @exit: Clean up after the BPF scheduler
- * @info: Exit info
- *
- * ops.exit() is also called on ops.init() failure, which is a bit
- * unusual. This is to allow rich reporting through @info on how
- * ops.init() failed.
- */
- void (*exit)(struct scx_exit_info *info);
-
- /**
- * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
- */
- u32 dispatch_max_batch;
-
- /**
- * @flags: %SCX_OPS_* flags
- */
- u64 flags;
-
- /**
- * @timeout_ms: The maximum amount of time, in milliseconds, that a
- * runnable task should be able to wait before being scheduled. The
- * maximum timeout may not exceed the default timeout of 30 seconds.
- *
- * Defaults to the maximum allowed timeout value of 30 seconds.
- */
- u32 timeout_ms;
-
- /**
- * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
- * value of 32768 is used.
- */
- u32 exit_dump_len;
-
- /**
- * @hotplug_seq: A sequence number that may be set by the scheduler to
- * detect when a hotplug event has occurred during the loading process.
- * If 0, no detection occurs. Otherwise, the scheduler will fail to
- * load if the sequence number does not match @scx_hotplug_seq on the
- * enable path.
- */
- u64 hotplug_seq;
-
- /**
- * @name: BPF scheduler's name
- *
- * Must be a non-zero valid BPF object name including only isalnum(),
- * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
- * BPF scheduler is enabled.
- */
- char name[SCX_OPS_NAME_LEN];
-
- /* internal use only, must be NULL */
- void *priv;
-};
-
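The operation table documented above is what a BPF scheduler fills in. Purely as an illustration (modelled on the in-tree scx_simple tooling; the BPF_STRUCT_OPS/SEC spellings and kfunc signatures are assumptions from that tooling, not taken from this diff), a minimal global-FIFO scheduler looks roughly like:

/* Minimal sketch of a sched_ext BPF scheduler; assumes the scx common headers. */
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	/* An idle CPU was found: insert directly so ops.enqueue() is skipped. */
	if (is_idle)
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return cpu;
}

void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* Everything else queues on the shared global DSQ in FIFO order. */
	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops simple_ops = {
	.select_cpu	= (void *)simple_select_cpu,
	.enqueue	= (void *)simple_enqueue,
	.name		= "simple",
};

Tasks parked on the global DSQ are consumed by the core when a CPU's local DSQ runs empty, so this sketch gets by without an ops.dispatch() callback.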
-enum scx_opi {
- SCX_OPI_BEGIN = 0,
- SCX_OPI_NORMAL_BEGIN = 0,
- SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
- SCX_OPI_END = SCX_OP_IDX(init),
-};
-
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * Total number of times a task's time slice was refilled with the
- * default value (SCX_SLICE_DFL).
- */
- s64 SCX_EV_REFILL_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-struct scx_sched {
- struct sched_ext_ops ops;
- DECLARE_BITMAP(has_op, SCX_OPI_END);
-
- /*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
- * This is to avoid live-locking in bypass mode where all tasks are
- * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
- * per-node split isn't sufficient, it can be further split.
- */
- struct rhashtable dsq_hash;
- struct scx_dispatch_q **global_dsqs;
-
- /*
- * The event counters are in a per-CPU variable to minimize the
- * accounting overhead. A system-wide view on the event counter is
- * constructed when requested by scx_bpf_events().
- */
- struct scx_event_stats __percpu *event_stats_cpu;
-
- bool warned_zero_slice;
-
- atomic_t exit_kind;
- struct scx_exit_info *exit_info;
-
- struct kobject kobj;
-
- struct kthread_worker *helper;
- struct irq_work error_irq_work;
- struct kthread_work disable_work;
- struct rcu_work rcu_work;
-};
-
-enum scx_wake_flags {
- /* expose select WF_* flags as enums */
- SCX_WAKE_FORK = WF_FORK,
- SCX_WAKE_TTWU = WF_TTWU,
- SCX_WAKE_SYNC = WF_SYNC,
-};
-
-enum scx_enq_flags {
- /* expose select ENQUEUE_* flags as enums */
- SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
- SCX_ENQ_HEAD = ENQUEUE_HEAD,
- SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
-
- /* high 32bits are SCX specific */
-
- /*
- * Set the following to trigger preemption when calling
- * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
- * current task is cleared to zero and the CPU is kicked into the
- * scheduling path. Implies %SCX_ENQ_HEAD.
- */
- SCX_ENQ_PREEMPT = 1LLU << 32,
-
- /*
- * The task being enqueued was previously enqueued on the current CPU's
- * %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
- * invoked in a ->cpu_release() callback, and the task is again
- * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
- * task will not be scheduled on the CPU until at least the next invocation
- * of the ->cpu_acquire() callback.
- */
- SCX_ENQ_REENQ = 1LLU << 40,
-
- /*
- * The task being enqueued is the only task available for the cpu. By
- * default, ext core keeps executing such tasks but when
- * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
- * %SCX_ENQ_LAST flag set.
- *
- * The BPF scheduler is responsible for triggering a follow-up
- * scheduling event. Otherwise, execution may stall.
- */
- SCX_ENQ_LAST = 1LLU << 41,
-
- /* high 8 bits are internal */
- __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
- SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
-};
-
-enum scx_deq_flags {
- /* expose select DEQUEUE_* flags as enums */
- SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
-
- /* high 32bits are SCX specific */
-
- /*
- * The generic core-sched layer decided to execute the task even though
- * it hasn't been dispatched yet. Dequeue from the BPF side.
- */
- SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
-};
-
-enum scx_pick_idle_cpu_flags {
- SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
- SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
-};
-
-enum scx_kick_flags {
- /*
- * Kick the target CPU if idle. Guarantees that the target CPU goes
- * through at least one full scheduling cycle before going idle. If the
- * target CPU can be determined to be currently not idle and going to go
- * through a scheduling cycle before going idle, noop.
- */
- SCX_KICK_IDLE = 1LLU << 0,
-
- /*
- * Preempt the current task and execute the dispatch path. If the
- * current task of the target CPU is an SCX task, its ->scx.slice is
- * cleared to zero before the scheduling path is invoked so that the
- * task expires and the dispatch path is invoked.
- */
- SCX_KICK_PREEMPT = 1LLU << 1,
-
- /*
- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
- * return after the target CPU finishes picking the next task.
- */
- SCX_KICK_WAIT = 1LLU << 2,
-};
-
-enum scx_tg_flags {
- SCX_TG_ONLINE = 1U << 0,
- SCX_TG_INITED = 1U << 1,
-};
-
-enum scx_enable_state {
- SCX_ENABLING,
- SCX_ENABLED,
- SCX_DISABLING,
- SCX_DISABLED,
-};
-
-static const char *scx_enable_state_str[] = {
- [SCX_ENABLING] = "enabling",
- [SCX_ENABLED] = "enabled",
- [SCX_DISABLING] = "disabling",
- [SCX_DISABLED] = "disabled",
-};
-
-/*
- * sched_ext_entity->ops_state
- *
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
- *
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
- *
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
- *
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
- */
-enum scx_ops_state {
- SCX_OPSS_NONE, /* owned by the SCX core */
- SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
- SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
- SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
-
- /*
- * QSEQ brands each QUEUED instance so that, when dispatch races
- * dequeue/requeue, the dispatcher can tell whether it still has a claim
- * on the task being dispatched.
- *
- * As some 32bit archs can't do 64bit store_release/load_acquire,
- * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
- * 32bit machines. The dispatch race window QSEQ protects is very narrow
- * and runs with IRQ disabled. 30 bits should be sufficient.
- */
- SCX_OPSS_QSEQ_SHIFT = 2,
-};
-
-/* Use macros to ensure that the type is unsigned long for the masks */
-#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
-#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
-
/*
* NOTE: sched_ext is in the process of growing multiple scheduler support and
* scx_root usage is in a transitional state. Naked dereferences are safe if the
@@ -1170,7 +136,7 @@ static struct kset *scx_kset;
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
-static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
@@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch,
va_end(args);
}
-static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, ...)
-{
- struct scx_sched *sch;
- va_list args;
-
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
- if (sch) {
- va_start(args, fmt);
- scx_vexit(sch, kind, exit_code, fmt, args);
- va_end(args);
- }
- rcu_read_unlock();
-}
-
#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
-#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
@@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b)
return (s32)(a - b) < 0;
}
-static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
+static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
+ struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
-
return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
@@ -1363,11 +311,11 @@ do { \
})
/* @mask is constant, always inline to cull unnecessary branches */
-static __always_inline bool scx_kf_allowed(u32 mask)
+static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_kf_error("cpu_release kfunc called from a nested operation");
+ scx_error(sch, "cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_kf_error("dispatch kfunc called from a nested operation");
+ scx_error(sch, "dispatch kfunc called from a nested operation");
return false;
}
@@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask)
}
/* see SCX_CALL_OP_TASK() */
-static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
+static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
+ u32 mask,
struct task_struct *p)
{
- if (!scx_kf_allowed(mask))
+ if (!scx_kf_allowed(sch, mask))
return false;
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_kf_error("called on a task not being operated on");
+ scx_error(sch, "called on a task not being operated on");
return false;
}
@@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq {
*/
struct scx_task_iter {
struct sched_ext_entity cursor;
- struct task_struct *locked;
+ struct task_struct *locked_task;
struct rq *rq;
struct rq_flags rf;
u32 cnt;
+ bool list_locked;
};
/**
@@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
- iter->locked = NULL;
+ iter->locked_task = NULL;
iter->cnt = 0;
+ iter->list_locked = true;
}
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
- if (iter->locked) {
- task_rq_unlock(iter->rq, iter->locked, &iter->rf);
- iter->locked = NULL;
+ if (iter->locked_task) {
+ task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
+ iter->locked_task = NULL;
}
}
@@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
*
* If @iter is in the middle of a locked iteration, it may be locking the rq of
* the task currently being visited in addition to scx_tasks_lock. Unlock both.
- * This function can be safely called anytime during an iteration.
+ * This function can be safely called anytime during an iteration. The next
+ * iterator operation will automatically restore the necessary locking.
*/
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
__scx_task_iter_rq_unlock(iter);
- spin_unlock_irq(&scx_tasks_lock);
+ if (iter->list_locked) {
+ iter->list_locked = false;
+ spin_unlock_irq(&scx_tasks_lock);
+ }
}
-/**
- * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
- * @iter: iterator to re-lock
- *
- * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
- * doesn't re-lock the rq lock. Must be called before other iterator operations.
- */
-static void scx_task_iter_relock(struct scx_task_iter *iter)
+static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
- spin_lock_irq(&scx_tasks_lock);
+ if (!iter->list_locked) {
+ spin_lock_irq(&scx_tasks_lock);
+ iter->list_locked = true;
+ }
}
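/*
 * Illustrative sketch, not part of the patch: the typical walk the
 * auto-relocking scheme above enables. A caller may drop both locks at any
 * point (e.g. to sleep) and the next iterator call transparently re-acquires
 * scx_tasks_lock. Names follow the iterator API in this file; the body is
 * schematic and error handling is elided.
 */
static void example_walk_scx_tasks(void)
{
	struct scx_task_iter sti;
	struct task_struct *p;

	scx_task_iter_start(&sti);
	while ((p = scx_task_iter_next_locked(&sti))) {
		/* p's rq is locked here; inspect or update p->scx state */
		scx_task_iter_unlock(&sti);	/* safe anytime; relocked on next call */
	}
	scx_task_iter_stop(&sti);
}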
/**
@@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter)
*/
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
+ __scx_task_iter_maybe_relock(iter);
list_del_init(&iter->cursor.tasks_node);
scx_task_iter_unlock(iter);
}
@@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
+ __scx_task_iter_maybe_relock(iter);
+
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
- scx_task_iter_relock(iter);
+ __scx_task_iter_maybe_relock(iter);
}
list_for_each_entry(pos, cursor, tasks_node) {
@@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return NULL;
iter->rq = task_rq_lock(p, &iter->rf);
- iter->locked = p;
+ iter->locked_task = p;
return p;
}
@@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This can be used when preemption is not disabled.
*/
#define scx_add_event(sch, name, cnt) do { \
- this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, (cnt)); \
} while(0)
@@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This should be used only when preemption is disabled.
*/
#define __scx_add_event(sch, name, cnt) do { \
- __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
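/*
 * Illustrative sketch, not part of the patch: the only difference between the
 * two macros is whether preemption must already be disabled when the per-CPU
 * counter is bumped. The event names are taken from elsewhere in this file.
 */
	scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);	/* may be called preemptible */
	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);	/* caller has preemption disabled */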
@@ -1766,23 +720,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
}
/**
- * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
- * @cpu: cpu number which came from a BPF ops
- * @where: extra information reported on error
- *
- * The same as ops_cpu_valid() but @sch is implicit.
- */
-static bool kf_cpu_valid(u32 cpu, const char *where)
-{
- if (__cpu_valid(cpu)) {
- return true;
- } else {
- scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
- return false;
- }
-}
-
-/**
* ops_sanitize_err - Sanitize a -errno value
* @sch: scx_sched to error out on error
* @ops_name: operation to blame on failure
@@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void refill_task_slice_dfl(struct task_struct *p)
+static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+ __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
@@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
raw_spin_lock(&dsq->lock);
}
}
@@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
return &cpu_rq(cpu)->scx.local_dsq;
}
if (dsq_id == SCX_DSQ_GLOBAL)
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
else
dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
dsq_id, p->comm, p->pid);
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
}
return dsq;
}
-static void mark_direct_dispatch(struct task_struct *ddsp_task,
+static void mark_direct_dispatch(struct scx_sched *sch,
+ struct task_struct *ddsp_task,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
@@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_kf_error("%s[%d] already direct-dispatched",
+ scx_error(sch, "%s[%d] already direct-dispatched",
p->comm, p->pid);
else
- scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
ddsp_task->comm, ddsp_task->pid,
p->comm, p->pid);
return;
@@ -2333,15 +1271,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
local_norefill:
dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- refill_task_slice_dfl(p);
- dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(sch, p);
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch,
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(scx_root,
- SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dst_dsq = find_global_dsq(p);
+ dst_dsq = find_global_dsq(sch, p);
dst_rq = src_rq;
}
} else {
@@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dispatch_enqueue(sch, find_global_dsq(p), p,
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* balance(), we want to complete this scheduling cycle and then
* start a new one. IOW, we want to call resched_curr() on the
* next, most likely idle, task, not the current one. Use
- * scx_bpf_kick_cpu() for deferred kicking.
+ * scx_kick_cpu() for deferred kicking.
*/
if (unlikely(!--nr_loops)) {
- scx_bpf_kick_cpu(cpu_of(rq), 0);
+ scx_kick_cpu(sch, cpu_of(rq), 0);
break;
}
} while (dspc->nr_tasks);
@@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq)
if (keep_prev) {
p = prev;
if (!p->scx.slice)
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
} else {
p = first_local_task(rq);
if (!p) {
if (kick_idle)
- scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
+ scx_kick_cpu(rcu_dereference_sched(scx_root),
+ cpu_of(rq), SCX_KICK_IDLE);
return NULL;
}
if (unlikely(!p->scx.slice)) {
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = rcu_dereference_sched(scx_root);
if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
sch->warned_zero_slice = true;
}
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
}
}
@@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
} else {
cpu = prev_cpu;
@@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq)
#ifdef CONFIG_EXT_GROUP_SCHED
-DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
static bool scx_cgroup_enabled;
void scx_tg_init(struct task_group *tg)
@@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg)
WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
@@ -4122,7 +3058,6 @@ int scx_tg_online(struct task_group *tg)
tg->scx.flags |= SCX_TG_ONLINE;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ret;
}
@@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg)
WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
(tg->scx.flags & SCX_TG_INITED))
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
tg->css.cgroup);
tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
-
- percpu_up_read(&scx_cgroup_rwsem);
}
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
struct task_struct *p;
int ret;
- /* released in scx_finish/cancel_attach() */
- percpu_down_read(&scx_cgroup_rwsem);
-
if (!scx_cgroup_enabled)
return 0;
@@ -4192,7 +3120,6 @@ err:
p->scx.cgrp_moving_from = NULL;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
@@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p)
p->scx.cgrp_moving_from = NULL;
}
-void scx_cgroup_finish_attach(void)
-{
- percpu_up_read(&scx_cgroup_rwsem);
-}
-
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
struct scx_sched *sch = scx_root;
@@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
struct task_struct *p;
if (!scx_cgroup_enabled)
- goto out_unlock;
+ return;
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
@@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
-out_unlock:
- percpu_up_read(&scx_cgroup_rwsem);
}
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
@@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
tg->scx.weight = weight;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
void scx_group_set_idle(struct task_group *tg, bool idle)
@@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg,
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
(tg->scx.bw_period_us != period_us ||
@@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg,
tg->scx.bw_quota_us = quota_us;
tg->scx.bw_burst_us = burst_us;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
static void scx_cgroup_lock(void)
{
- percpu_down_write(&scx_cgroup_rwsem);
+ percpu_down_write(&scx_cgroup_ops_rwsem);
+ cgroup_lock();
}
static void scx_cgroup_unlock(void)
{
- percpu_up_write(&scx_cgroup_rwsem);
+ cgroup_unlock();
+ percpu_up_write(&scx_cgroup_ops_rwsem);
}
#else /* CONFIG_EXT_GROUP_SCHED */
-static inline void scx_cgroup_lock(void) {}
-static inline void scx_cgroup_unlock(void) {}
+static void scx_cgroup_lock(void) {}
+static void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
@@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
scx_cgroup_enabled = false;
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and exit all the inited ones, all online cgroups are exited.
*/
- rcu_read_lock();
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
@@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
if (!sch->ops.cgroup_exit)
continue;
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
css->cgroup);
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
}
static int scx_cgroup_init(struct scx_sched *sch)
@@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch)
struct cgroup_subsys_state *css;
int ret;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and init, all online cgroups are initialized.
*/
- rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
struct scx_cgroup_init_args args = {
@@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
continue;
}
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
@@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
return ret;
}
tg->scx.flags |= SCX_TG_INITED;
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
WARN_ON_ONCE(scx_cgroup_enabled);
scx_cgroup_enabled = true;
@@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
int node;
kthread_stop(sch->helper->task);
- free_percpu(sch->event_stats_cpu);
+ free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -4671,9 +3571,22 @@ bool task_should_scx(int policy)
bool scx_allow_ttwu_queue(const struct task_struct *p)
{
- return !scx_enabled() ||
- (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
- p->sched_class != &ext_sched_class;
+ struct scx_sched *sch;
+
+ if (!scx_enabled())
+ return true;
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return true;
+
+ if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ return true;
+
+ if (unlikely(p->sched_class != &ext_sched_class))
+ return true;
+
+ return false;
}
/**
@@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void)
*
* - pick_next_task() suppresses zero slice warning.
*
- * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
* operations.
*
* - scx_prio_less() reverts to the default core_sched_at order.
@@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
- dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
+ dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
+ p->migration_disabled);
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
@@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
sch->global_dsqs[node] = dsq;
}
- sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
- if (!sch->event_stats_cpu)
+ sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
+ if (!sch->pcpu)
goto err_free_gdsqs;
sch->helper = kthread_run_worker(0, "sched_ext_helper");
if (!sch->helper)
- goto err_free_event_stats;
+ goto err_free_pcpu;
sched_set_fifo(sch->helper->task);
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
@@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
err_stop_helper:
kthread_stop(sch->helper->task);
-err_free_event_stats:
- free_percpu(sch->event_stats_cpu);
+err_free_pcpu:
+ free_percpu(sch->pcpu);
err_free_gdsqs:
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
@@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
- scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid);
@@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_set_task_state(p, SCX_TASK_READY);
put_task_struct(p);
- scx_task_iter_relock(&sti);
}
scx_task_iter_stop(&sti);
scx_cgroup_unlock();
@@ -5795,7 +4708,7 @@ err_unlock:
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_bypass(false);
+ /* we'll soon enter disable path, keep bypass on */
err_disable:
mutex_unlock(&scx_enable_mutex);
/*
@@ -6328,40 +5241,41 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
+static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
- if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+ if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
return false;
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_kf_error("called with NULL task");
+ scx_error(sch, "called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
return false;
}
return true;
}
-static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
+static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 enq_flags)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct task_struct *ddsp_task;
ddsp_task = __this_cpu_read(direct_dispatch_task);
if (ddsp_task) {
- mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
+ mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags);
return;
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_kf_error("dispatch buffer overflow");
+ scx_error(sch, "dispatch buffer overflow");
return;
}
@@ -6413,7 +5327,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6421,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
else
p->scx.slice = p->scx.slice ?: 1;
- scx_dsq_insert_commit(p, dsq_id, enq_flags);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
}
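/*
 * Illustrative BPF-side sketch, not part of the patch: a minimal ops.enqueue()
 * that forwards every task to the global DSQ with the default slice. Assumes
 * the scx tooling headers providing BPF_STRUCT_OPS and the kfunc declarations
 * are available.
 */
void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}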
/**
@@ -6448,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
u64 slice, u64 vtime, u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6458,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
p->scx.dsq_vtime = vtime;
- scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
__bpf_kfunc_end_defs();
@@ -6483,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
bool in_balance;
unsigned long flags;
- if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
+ if (!scx_kf_allowed_if_unlocked() &&
+ !scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
/*
@@ -6568,7 +5497,15 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
{
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return 0;
return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
@@ -6583,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
__bpf_kfunc void scx_bpf_dispatch_cancel(void)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_sched *sch;
+
+ guard(rcu)();
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return;
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_kf_error("dispatch buffer underflow");
+ scx_error(sch, "dispatch buffer underflow");
}
/**
@@ -6609,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
- struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
flush_dispatch_buf(sch, dspc->rq);
@@ -6760,12 +5710,18 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
{
+ struct scx_sched *sch;
LIST_HEAD(tasks);
u32 nr_enqueued = 0;
struct rq *rq;
struct task_struct *p, *n;
- if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
return 0;
rq = cpu_rq(smp_processor_id());
@@ -6877,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
__bpf_kfunc_start_defs();
-/**
- * scx_bpf_kick_cpu - Trigger reschedule on a CPU
- * @cpu: cpu to kick
- * @flags: %SCX_KICK_* flags
- *
- * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
- * trigger rescheduling on a busy CPU. This can be called from any online
- * scx_ops operation and the actual kicking is performed asynchronously through
- * an irq work.
- */
-__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
{
struct rq *this_rq;
unsigned long irq_flags;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!ops_cpu_valid(sch, cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6916,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6941,6 +5887,26 @@ out:
}
/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (likely(sch))
+ scx_kick_cpu(sch, cpu, flags);
+}
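/*
 * Illustrative BPF-side sketch, not part of the patch: inserting into a shared
 * DSQ and then waking an idle CPU so the task gets pulled. SHARED_DSQ is a
 * hypothetical DSQ id created by the scheduler; the kick itself is performed
 * asynchronously through irq work as described above. Assumes the scx tooling
 * headers (BPF_STRUCT_OPS, kfunc declarations) are available.
 */
void BPF_STRUCT_OPS(example_enqueue_and_kick, struct task_struct *p, u64 enq_flags)
{
	s32 cpu = scx_bpf_task_cpu(p);

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}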
+
+/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
*
@@ -7120,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
__bpf_kfunc_end_defs();
-static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
- char *fmt, unsigned long long *data, u32 data__sz)
+static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
+ size_t line_size, char *fmt, unsigned long long *data,
+ u32 data__sz)
{
struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
s32 ret;
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
+ scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_kf_error("failed to read data fields (%d)", ret);
+ scx_error(sch, "failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_kf_error("format preparation failed (%d)", ret);
+ scx_error(sch, "format preparation failed (%d)", ret);
return ret;
}
@@ -7149,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
+ scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
return ret;
}
-static s32 bstr_format(struct scx_bstr_buf *buf,
+static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
char *fmt, unsigned long long *data, u32 data__sz)
{
- return __bstr_format(buf->data, buf->line, sizeof(buf->line),
+ return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
fmt, data, data__sz);
}
@@ -7178,11 +6145,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7198,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7221,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
struct scx_dump_data *dd = &scx_dump_data;
struct scx_bstr_buf *buf = &dd->buf;
s32 ret;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (raw_smp_processor_id() != dd->cpu) {
- scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
/* append the formatted string to the line buf */
- ret = __bstr_format(buf->data, buf->line + dd->cursor,
+ ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
if (ret < 0) {
dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
@@ -7267,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7289,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7311,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
*/
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (kf_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -7325,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
* to the corresponding CPU to prevent ABBA deadlocks.
*/
if (locked_rq && rq != locked_rq) {
- scx_kf_error("Invalid target CPU %d", cpu);
+ scx_error(sch, "Invalid target CPU %d", cpu);
return;
}
@@ -7420,13 +6418,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return NULL;
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ if (!sch->warned_deprecated_rq) {
+ printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
+ "use scx_bpf_locked_rq() when holding rq lock "
+ "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
+ sch->warned_deprecated_rq = true;
+ }
+
return cpu_rq(cpu);
}
/**
+ * scx_bpf_locked_rq - Return the rq currently locked by SCX
+ *
+ * Returns the rq if a rq lock is currently held by SCX.
+ * Otherwise emits an error and returns NULL.
+ */
+__bpf_kfunc struct rq *scx_bpf_locked_rq(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(preempt)();
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ rq = scx_locked_rq();
+ if (!rq) {
+ scx_error(sch, "accessing rq without holding rq lock");
+ return NULL;
+ }
+
+ return rq;
+}
+
+/**
+ * scx_bpf_cpu_curr - Return remote CPU's curr task
+ * @cpu: CPU of interest
+ *
+ * Callers must hold RCU read lock (KF_RCU).
+ */
+__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ return rcu_dereference(cpu_rq(cpu)->curr);
+}
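/*
 * Illustrative BPF-side sketch, not part of the patch: reading a remote CPU's
 * current task with the new kfunc. As it is registered KF_RCU_PROTECTED, the
 * verifier requires an explicit RCU read-side section (the standard
 * bpf_rcu_read_lock()/bpf_rcu_read_unlock() kfuncs) around the call.
 */
static __always_inline s32 example_remote_curr_pid(s32 cpu)
{
	struct task_struct *curr;
	s32 pid = -1;

	bpf_rcu_read_lock();
	curr = scx_bpf_cpu_curr(cpu);
	if (curr)
		pid = curr->pid;
	bpf_rcu_read_unlock();

	return pid;
}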
+
+/**
* scx_bpf_task_cgroup - Return the sched cgroup of a task
* @p: task of interest
*
@@ -7442,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
{
struct task_group *tg = p->sched_task_group;
struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out;
- if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+ if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
goto out;
cgrp = tg_cgrp(tg);
@@ -7524,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
/* Aggregate per-CPU event counters into @events. */
memset(events, 0, sizeof(*events));
for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7590,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
+BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 292bb41a242e..43429b33e52c 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,29 +8,6 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
-static inline bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
-static inline bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
-}
-
-DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-
-DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
-
-/*
- * Return the rq currently locked from an scx callback, or NULL if no rq is
- * locked.
- */
-static inline struct rq *scx_locked_rq(void)
-{
- return __this_cpu_read(scx_locked_rq_state);
-}
-
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -100,7 +77,6 @@ int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
void scx_cgroup_move_task(struct task_struct *p);
-void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
@@ -111,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
static inline void scx_cgroup_move_task(struct task_struct *p) {}
-static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 537c6992bb63..d2434c954848 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -819,10 +819,10 @@ void scx_idle_disable(void)
* Helpers that can be called from the BPF scheduler.
*/
-static int validate_node(int node)
+static int validate_node(struct scx_sched *sch, int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is disabled");
+ scx_error(sch, "per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -832,13 +832,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_kf_error("invalid node %d", node);
+ scx_error(sch, "invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_kf_error("unavailable node %d", node);
+ scx_error(sch, "unavailable node %d", node);
return -EINVAL;
}
@@ -847,12 +847,12 @@ static int validate_node(int node)
__bpf_kfunc_start_defs();
-static bool check_builtin_idle_enabled(void)
+static bool check_builtin_idle_enabled(struct scx_sched *sch)
{
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_kf_error("built-in idle tracking is disabled");
+ scx_error(sch, "built-in idle tracking is disabled");
return false;
}
@@ -882,17 +882,18 @@ static bool is_bpf_migration_disabled(const struct task_struct *p)
return p->migration_disabled;
}
-static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags,
const struct cpumask *allowed, u64 flags)
{
struct rq *rq;
struct rq_flags rf;
s32 cpu;
- if (!kf_cpu_valid(prev_cpu, NULL))
+ if (!ops_cpu_valid(sch, prev_cpu, NULL))
return -EINVAL;
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
/*
@@ -905,7 +906,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
if (scx_kf_allowed_if_unlocked()) {
rq = task_rq_lock(p, &rf);
} else {
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
return -EPERM;
rq = scx_locked_rq();
}
@@ -948,9 +949,13 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
- return NUMA_NO_NODE;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
+ return NUMA_NO_NODE;
return cpu_to_node(cpu);
}
@@ -972,15 +977,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
+ struct scx_sched *sch;
s32 cpu;
- cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
*is_idle = false;
-
return prev_cpu;
}
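/*
 * Illustrative BPF-side sketch, not part of the patch: the common
 * ops.select_cpu() pattern built on scx_bpf_select_cpu_dfl(). If an idle CPU
 * was found, dispatch directly to its local DSQ so the kicked CPU has
 * something to run. Assumes the scx tooling headers (BPF_STRUCT_OPS, kfunc
 * declarations) are available.
 */
s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	if (is_idle)
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return cpu;
}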
@@ -1007,7 +1018,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags)
{
- return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
+ cpus_allowed, flags);
}
/**
@@ -1021,7 +1041,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1037,12 +1065,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
return idle_cpumask(NUMA_NO_NODE)->cpu;
@@ -1060,7 +1096,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1080,12 +1124,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
if (sched_smt_active())
@@ -1121,10 +1173,18 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
*/
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
- if (!check_builtin_idle_enabled())
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return false;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!check_builtin_idle_enabled(sch))
+ return false;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
return false;
return scx_idle_test_and_clear_cpu(cpu);
@@ -1152,7 +1212,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1184,12 +1252,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
@@ -1219,9 +1295,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
- node = validate_node(node);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1259,10 +1342,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
new file mode 100644
index 000000000000..b3617abed510
--- /dev/null
+++ b/kernel/sched/ext_internal.h
@@ -0,0 +1,1078 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_TASK_ITER_BATCH = 32,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
+ *
+ * Bits: [63 .. 48 47 .. 32 31 .. 0]
+ * [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
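/*
 * Illustrative sketch, not part of the patch: composing an exit code in the
 * layout described above, e.g. to pass as the first argument of
 * scx_bpf_exit_bstr(). The low-32-bit user code (0x42) is made up.
 */
	u64 ecode = SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG | 0x42;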
+
+enum scx_exit_flags {
+ /*
+ * ops.exit() may be called even if the loading failed before ops.init()
+ * finishes successfully. This is because ops.exit() allows rich exit
+ * info communication. The following flag indicates whether ops.init()
+ * finished successfully.
+ */
+	SCX_EFLAG_INITIALIZED	= 1LLU << 0,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* %SCX_EFLAG_* */
+ u64 flags;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+
+ /* debug dump */
+ char *dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+	 * By default, if there are no other tasks to run on the CPU, the ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
+
+ /*
+ * CPU cgroup support flags
+ */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /* the cgroup the task is joining */
+ struct cgroup *cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+ /* the weight of the cgroup [1..10000] */
+ u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
+};
+
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
+ /* next task is being scheduled by &sched_class_dl */
+ SCX_CPU_PREEMPT_DL,
+ /* next task is being scheduled by &sched_class_stop */
+ SCX_CPU_PREEMPT_STOP,
+ /* unknown reason for SCX being preempted */
+ SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+ /* the reason the CPU was preempted */
+ enum scx_cpu_preempt_reason reason;
+
+ /* the task that's going to be scheduled on the CPU */
+ struct task_struct *task;
+};
+
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
+ */
+struct sched_ext_ops {
+ /**
+ * @select_cpu: Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * Decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * @enqueue: Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
+ *
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @dequeue: Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
+ *
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * @tick: Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * @runnable: A task is becoming runnable on its associated CPU
+ * @p: task becoming runnable
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * This and the following three functions can be used to track a task's
+ * execution state transitions. A task becomes ->runnable() on a CPU,
+ * and then goes through one or more ->running() and ->stopping() pairs
+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+ * done running on the CPU.
+ *
+ * @p is becoming runnable on the CPU because it's
+ *
+ * - waking up (%SCX_ENQ_WAKEUP)
+ * - being moved from another CPU
+ * - being restored after being temporarily taken off the queue for an
+ * attribute change.
+ *
+ * This and ->enqueue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be followed by ->enqueue()
+ * e.g. when @p is being dispatched to a remote CPU, or when @p is
+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+ * task may be ->enqueue()'d without being preceded by this operation
+ * e.g. after exhausting its slice.
+ */
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @running: A task is starting to run on its associated CPU
+ * @p: task starting to run
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+ * property is changed (e.g., affinity), since scx_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ */
+ void (*running)(struct task_struct *p);
+
+ /**
+ * @stopping: A task is stopping execution
+ * @p: task that is stopping
+ * @runnable: is task @p still runnable?
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+ * property is changed (e.g., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
+ * See ->runnable() for explanation on the task state notifiers. If
+ * !@runnable, ->quiescent() will be invoked after this operation
+ * returns.
+ */
+ void (*stopping)(struct task_struct *p, bool runnable);
+
+ /**
+ * @quiescent: A task is becoming not runnable on its associated CPU
+ * @p: task becoming not runnable
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ *
+ * @p is becoming quiescent on the CPU because it's
+ *
+ * - sleeping (%SCX_DEQ_SLEEP)
+ * - being moved to another CPU
+ * - being temporarily taken off the queue for an attribute change
+ * (%SCX_DEQ_SAVE)
+ *
+ * This and ->dequeue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be preceded by ->dequeue()
+ * e.g. when @p is being dispatched to a remote CPU.
+ */
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @yield: Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * @core_sched_before: Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Used by core-sched to determine the ordering between two tasks. See
+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+ * core-sched.
+ *
+ * Both @a and @b are runnable and may or may not currently be queued on
+ * the BPF scheduler. Should return %true if @a should run before @b.
+ * %false if there's no required ordering or @b should run before @a.
+ *
+ * If not specified, the default is ordering them according to when they
+ * became runnable.
+ */
+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+ /**
+ * @set_weight: Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * @set_cpumask: Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+ * This operation is called when @cpu enters or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user also must implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+ /**
+ * @init_task: Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * @exit_task: Exit a previously-running task from the system
+ * @p: task to exit
+ * @args: exit arguments, see the struct definition
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * @enable: Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * @disable: Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /**
+ * @dump: Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /**
+ * @cgroup_init: Initialize a cgroup
+ * @cgrp: cgroup being initialized
+ * @args: init arguments, see the struct definition
+ *
+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
+ * @cgrp for sched_ext. This operation may block.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During cgroup
+ * creation, it will abort the specific cgroup creation.
+ */
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+
+ /**
+ * @cgroup_exit: Exit a cgroup
+ * @cgrp: cgroup being exited
+ *
+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+ * @cgrp for sched_ext. This operation may block.
+ */
+ void (*cgroup_exit)(struct cgroup *cgrp);
+
+ /**
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Prepare @p for move from cgroup @from to @to. This operation may
+ * block and can be used for allocations.
+ *
+ * Return 0 for success, -errno for failure. An error return aborts the
+ * migration.
+ */
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_move: Commit cgroup move
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Commit the move. @p is dequeued during this operation.
+ */
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_cancel_move: Cancel cgroup move
+ * @p: task whose cgroup move is being canceled
+ * @from: cgroup @p was being moved from
+ * @to: cgroup @p was being moved to
+ *
+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+ * Undo the preparation.
+ */
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_set_weight: A cgroup's weight is being changed
+ * @cgrp: cgroup whose weight is being updated
+ * @weight: new weight [1..10000]
+ *
+ * Update @cgrp's weight to @weight.
+ */
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+ * to. For example, if @period_us is 1_000_000 and @quota_us is
+ * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+ * interpreted in the same fashion and specifies how much @cgrp can
+ * burst temporarily. The specific control mechanism and thus the
+ * interpretation of @period_us and burstiness is up to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * @cpu_online: A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * @cpu_offline: A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
+ */
+
+ /**
+ * @init: Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * @exit: Clean up after the BPF scheduler
+ * @info: Exit info
+ *
+ * ops.exit() is also called on ops.init() failure, which is a bit
+ * unusual. This is to allow rich reporting through @info on how
+ * ops.init() failed.
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * @flags: %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
+ /**
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
+ * @name: BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
+};
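
For orientation only, and not part of this patch: a minimal sketch of how the three hot-path callbacks documented above typically fit together in a BPF scheduler, using the kfuncs named in the comments (scx_bpf_select_cpu_dfl(), scx_bpf_dsq_insert(), scx_bpf_dsq_move_to_local()). The example_* names are hypothetical and the usual sched_ext BPF skeleton headers are assumed.

s32 example_select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	/* Direct insertion from here skips ops.enqueue(), as noted above. */
	if (is_idle)
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return cpu;
}

void example_enqueue(struct task_struct *p, u64 enq_flags)
{
	/* Hand @p to the shared DSQ; it stalls if never dispatched from there. */
	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

void example_dispatch(s32 cpu, struct task_struct *prev)
{
	/* Refill this CPU's local DSQ when it runs dry. */
	scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL);
}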
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched_pcpu {
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats event_stats;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+ struct scx_sched_pcpu __percpu *pcpu;
+
+ bool warned_zero_slice:1;
+ bool warned_deprecated_rq:1;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
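
As an illustration only (not from the patch), a system-wide view could be folded from the per-CPU counters above roughly as below; scx_bpf_events() is the accessor mentioned in the comment, while the helper name and the subset of fields shown are just for the sketch.

static void example_sum_events(struct scx_sched *sch, struct scx_event_stats *sum)
{
	int cpu;

	memset(sum, 0, sizeof(*sum));
	for_each_possible_cpu(cpu) {
		struct scx_event_stats *e = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;

		sum->SCX_EV_SELECT_CPU_FALLBACK += e->SCX_EV_SELECT_CPU_FALLBACK;
		sum->SCX_EV_DISPATCH_KEEP_LAST += e->SCX_EV_DISPATCH_KEEP_LAST;
		sum->SCX_EV_BYPASS_DURATION += e->SCX_EV_BYPASS_DURATION;
		/* ...the remaining counters are summed the same way... */
	}
}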
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
+ /*
+ * The task being enqueued was previously enqueued on the current CPU's
+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+ * invoked in a ->cpu_release() callback, and the task is again
+ * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
+ * task will not be scheduled on the CPU until at least the next invocation
+ * of the ->cpu_acquire() callback.
+ */
+ SCX_ENQ_REENQ = 1LLU << 40,
+
+ /*
+ * The task being enqueued is the only task available for the cpu. By
+ * default, ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * The BPF scheduler is responsible for triggering a follow-up
+ * scheduling event. Otherwise, execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+};
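
A one-line usage sketch (not part of the patch): from ops.enqueue(), a scheduler could make a wakeup preempt whatever is running on the local CPU by combining %SCX_ENQ_PREEMPT with a local-DSQ insertion.

	/* Force @p to run next on this CPU, expiring the current slice. */
	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, SCX_ENQ_PREEMPT);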
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The generic core-sched layer decided to execute the task even though
+ * it hasn't been dispatched yet. Dequeue from the BPF side.
+ */
+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
+};
+
+enum scx_kick_flags {
+ /*
+ * Kick the target CPU if idle. Guarantees that the target CPU goes
+ * through at least one full scheduling cycle before going idle. If the
+ * target CPU can be determined to be currently not idle and about to
+ * go through a scheduling cycle before going idle, this is a noop.
+ */
+ SCX_KICK_IDLE = 1LLU << 0,
+
+ /*
+ * Preempt the current task and execute the dispatch path. If the
+ * current task of the target CPU is an SCX task, its ->scx.slice is
+ * cleared to zero before the scheduling path is invoked so that the
+ * task expires and the dispatch path is invoked.
+ */
+ SCX_KICK_PREEMPT = 1LLU << 1,
+
+ /*
+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+ * return after the target CPU finishes picking the next task.
+ */
+ SCX_KICK_WAIT = 1LLU << 2,
+};
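
Usage sketch only (not part of the patch): after inserting a task into another CPU's DSQ, a scheduler would typically nudge that CPU with scx_bpf_kick_cpu() using one of the flags above; target_cpu is a placeholder.

	/* Wake an idle CPU so it runs through at least one dispatch cycle. */
	scx_bpf_kick_cpu(target_cpu, SCX_KICK_IDLE);

	/* Or force an immediate reschedule on a busy CPU. */
	scx_bpf_kick_cpu(target_cpu, SCX_KICK_PREEMPT);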
+
+enum scx_tg_flags {
+ SCX_TG_ONLINE = 1U << 0,
+ SCX_TG_INITED = 1U << 1,
+};
+
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
+};
+
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
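
A small sketch, derived from the masks above (the helper names are hypothetical), of how the packed ops_state value splits into the state and the QSEQ brand:

static inline unsigned long opss_state(unsigned long opss)
{
	return opss & SCX_OPSS_STATE_MASK;	/* SCX_OPSS_NONE..DISPATCHING */
}

static inline unsigned long opss_qseq(unsigned long opss)
{
	return opss >> SCX_OPSS_QSEQ_SHIFT;	/* brand of this QUEUED instance */
}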
+
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 426287930497..3a89f949e307 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se)
if (!gcfs_rq || !gcfs_rq->load.weight)
return;
- if (throttled_hierarchy(gcfs_rq))
- return;
-
shares = calc_group_shares(gcfs_rq);
if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
@@ -5291,18 +5288,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (cfs_rq->pelt_clock_throttled) {
struct rq *rq = rq_of(cfs_rq);
- if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
- cfs_rq->throttled_clock = rq_clock(rq);
- if (!cfs_rq->throttled_clock_self)
- cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
}
+#endif
}
}
@@ -5341,8 +5336,6 @@ static void set_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable--;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5363,8 +5356,6 @@ static void clear_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable++;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5392,7 +5383,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* DELAY_DEQUEUE relies on spurious wakeups, special task
* states must not suffer spurious wakeups, excempt them.
*/
- if (flags & DEQUEUE_SPECIAL)
+ if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
delay = false;
WARN_ON_ONCE(delay && se->sched_delayed);
@@ -5450,8 +5441,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+#endif
+ }
return true;
}
@@ -5725,74 +5726,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return cfs_bandwidth_used() && p->throttled;
+}
+
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
+static void throttle_cfs_rq_work(struct callback_head *work)
+{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ scoped_guard(task_rq_lock, p) {
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ /*
+ * If not in limbo, then either replenish has happened or this
+ * task got migrated out of the throttled cfs_rq, move along.
+ */
+ if (!cfs_rq->throttle_count)
+ return;
+ rq = scope.rq;
+ update_rq_clock(rq);
+ WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ /*
+ * Must not set throttled before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
+ p->throttled = true;
+ resched_curr(rq);
+ }
+}
+
+void init_cfs_throttle_work(struct task_struct *p)
+{
+ init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
+ /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+ INIT_LIST_HEAD(&p->throttle_node);
+}
+
/*
- * Ensure that neither of the group entities corresponding to src_cpu or
- * dest_cpu are members of a throttled hierarchy when performing group
- * load-balance operations.
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after being throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
*/
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+ WARN_ON_ONCE(p->se.on_rq);
+ list_del_init(&p->throttle_node);
+
+ /* task blocked after throttled */
+ if (flags & DEQUEUE_SLEEP) {
+ p->throttled = false;
+ return;
+ }
+
+ /*
+ * task is migrating off its old cfs_rq, detach
+ * the task's load from its old cfs_rq.
+ */
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
{
- struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
- src_cfs_rq = tg->cfs_rq[src_cpu];
- dest_cfs_rq = tg->cfs_rq[dest_cpu];
+ /* @p should have gone through dequeue_throttled_task() first */
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+ /*
+ * If the throttled task @p is enqueued to a throttled cfs_rq,
+ * take the fast path by directly putting the task on the
+ * target cfs_rq's limbo list.
+ *
+ * Do not do that when @p is current because the following race can
+ * cause @p's group_node to be incorrectly re-inserted in its rq's
+ * cfs_tasks list, despite being throttled:
+ *
+ * cpuX cpuY
+ * p ret2user
+ * throttle_cfs_rq_work() sched_move_task(p)
+ * LOCK task_rq_lock
+ * dequeue_task_fair(p)
+ * UNLOCK task_rq_lock
+ * LOCK task_rq_lock
+ * task_current_donor(p) == true
+ * task_on_rq_queued(p) == true
+ * dequeue_task(p)
+ * put_prev_task(p)
+ * sched_change_group()
+ * enqueue_task(p) -> p's new cfs_rq
+ * is throttled, go
+ * fast path and skip
+ * actual enqueue
+ * set_next_task(p)
+ * list_move(&se->group_node, &rq->cfs_tasks); // bug
+ * schedule()
+ *
+ * In the above race case, @p's current cfs_rq is in the same rq as
+ * its previous cfs_rq because sched_move_task() only moves a task
+ * to a different group from the same rq, so we can use its current
+ * cfs_rq to derive rq and test if the task is current.
+ */
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
- return throttled_hierarchy(src_cfs_rq) ||
- throttled_hierarchy(dest_cfs_rq);
+ /* we can't take the fast path, do an actual enqueue */
+ p->throttled = false;
+ return false;
}
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
- cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
+ if (--cfs_rq->throttle_count)
+ return 0;
+
+ if (cfs_rq->pelt_clock_throttled) {
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ cfs_rq->throttled_clock_self = 0;
- cfs_rq->throttled_clock_self = 0;
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;
- if (WARN_ON_ONCE((s64)delta < 0))
- delta = 0;
+ cfs_rq->throttled_clock_self_time += delta;
+ }
- cfs_rq->throttled_clock_self_time += delta;
- }
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ p->throttled = false;
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ if (task_has_throttle_work(p))
+ return;
+
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
+static void record_throttle_clock(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttle_count++)
+ return 0;
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ /*
+ * For cfs_rqs that still have entities enqueued, PELT clock
+ * stop happens at dequeue time when all entities are dequeued.
+ */
+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
}
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
@@ -5800,8 +5980,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5824,76 +6003,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!dequeue)
return false; /* Throttle no longer required. */
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
WARN_ON_ONCE(cfs_rq->throttled_clock);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -5901,9 +6021,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+ /*
+ * It's possible we are called with !runtime_remaining due to things
+ * like the user changing the quota setting (see tg_set_cfs_bandwidth())
+ * or an async unthrottle handing us a positive runtime_remaining that
+ * other still-running entities consumed before we reached here.
+ *
+ * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+ * because any enqueue in tg_unthrottle_up() will immediately trigger a
+ * throttle, which is not supposed to happen on unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5933,62 +6064,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
- }
-
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
}
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -6472,6 +6549,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6639,19 +6717,28 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void task_throttle_setup_work(struct task_struct *p) {}
+static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
+static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
+
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return 0;
}
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
{
return 0;
}
@@ -6831,6 +6918,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (task_is_throttled(p) && enqueue_throttled_task(p))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6883,10 +6973,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -6908,10 +6994,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -6941,7 +7023,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -6963,6 +7044,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
bool was_sched_idle = sched_idle_rq(rq);
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
+ bool task_throttled = flags & DEQUEUE_THROTTLE;
struct task_struct *p = NULL;
int h_nr_idle = 0;
int h_nr_queued = 0;
@@ -6996,9 +7078,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -7010,7 +7091,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ if (task_sleep && se)
set_next_buddy(se);
break;
}
@@ -7037,9 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
}
sub_nr_running(rq, h_nr_queued);
@@ -7073,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_is_throttled(p)) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
+
if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
@@ -8660,7 +8745,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
- if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ if (task_is_throttled(p))
return;
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
@@ -8741,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
+ bool throttled;
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_queued)
return NULL;
+ throttled = false;
+
do {
/* Might not have done put_prev_entity() */
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ throttled |= check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
@@ -8761,7 +8849,10 @@ again:
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (unlikely(throttled))
+ task_throttle_setup_work(p);
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
@@ -8923,8 +9014,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- /* throttled hierarchies are not runnable */
- if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ /* !se->on_rq also covers throttled task */
+ if (!se->on_rq)
return false;
/* Tell the scheduler that we'd really like se to run next. */
@@ -9283,7 +9374,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) throttled_lb_pair, or
+ * 2) target cfs_rq is in throttled hierarchy, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
* 5) are cache-hot on their current CPU, or
@@ -9292,7 +9383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
return 0;
/*
@@ -13076,10 +13167,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- if (!throttled_hierarchy(cfs_rq))
+ /*
+ * If a task gets attached to this cfs_rq and, before being queued,
+ * gets migrated to another CPU due to reasons like an affinity
+ * change, make sure this cfs_rq stays on the leaf cfs_rq list so
+ * that the removed load gets decayed, or it can cause a fairness
+ * problem.
+ */
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
@@ -13090,10 +13184,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
update_load_avg(cfs_rq, se, UPDATE_TG);
- if (cfs_rq_throttled(cfs_rq))
- break;
-
- if (!throttled_hierarchy(cfs_rq))
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f..f921302dc40f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
u64 throttled;
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 000000000000..a23747bbe25b
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/types.h>
+#include "sched.h"
+
+int main(void)
+{
+ DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+ return 0;
+}
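
Not part of the patch: rq-offsets.c follows the usual asm-offsets pattern, so the DEFINE() above should end up as a generated constant along the lines of the following (the numeric value is purely illustrative), usable by code that cannot include sched.h to reach rq->nr_pinned:

/* generated rq-offsets.h (sketch) */
#define RQ_nr_pinned 2944 /* offsetof(struct rq, nr_pinned) */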
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69458b9b46dc..1f5d07067f60 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -760,10 +760,12 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled;
+ bool throttled:1;
+ bool pelt_clock_throttled:1;
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -2367,6 +2369,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SPECIAL 0x10
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE 0x800
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2683,6 +2686,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_entity(struct sched_dl_entity *dl_se);
+extern void init_cfs_throttle_work(struct task_struct *p);
+
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6e2f54169e66..444bdfdab731 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
enum numa_topology_type sched_numa_topology_type;
static int sched_domains_numa_levels;
-static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
@@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl,
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
+ sd_weight = cpumask_weight(tl->mask(tl, cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
@@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl,
};
sd_span = sched_domain_span(sd);
- cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
sd_id = cpumask_first(sd_span);
sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
@@ -1732,22 +1724,63 @@ sd_init(struct sched_domain_topology_level *tl,
return sd;
}
+#ifdef CONFIG_SCHED_SMT
+int cpu_smt_flags(void)
+{
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_smt_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+int cpu_cluster_flags(void)
+{
+ return SD_CLUSTER | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_clustergroup_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+int cpu_core_flags(void)
+{
+ return SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_coregroup_mask(cpu);
+}
+#endif
+
+const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_node_mask(cpu);
+}
+
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
+ SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+ SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
#endif
- SDTL_INIT(cpu_cpu_mask, NULL, PKG),
+ SDTL_INIT(tl_pkg_mask, NULL, PKG),
{ NULL, },
};
@@ -1768,10 +1801,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl)
}
#ifdef CONFIG_NUMA
+static int cpu_numa_flags(void)
+{
+ return SD_NUMA;
+}
-static const struct cpumask *sd_numa_mask(int cpu)
+static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
}
static void sched_numa_warn(const char *str)
@@ -2413,7 +2450,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
* breaks the linking done for an earlier span.
*/
for_each_cpu(cpu, cpu_map) {
- const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+ const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu);
int id;
/* lowest bit set in this mask is used as a unique id */
@@ -2421,7 +2458,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
if (cpumask_test_cpu(id, id_seen)) {
/* First CPU has already been seen, ensure identical spans */
- if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+ if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask))
return false;
} else {
/* First CPU hasn't been seen before, ensure it's a completely new span */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3bbfba30a777..25f62867a16d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -741,6 +741,26 @@ out:
}
#ifdef SECCOMP_ARCH_NATIVE
+static bool seccomp_uprobe_exception(struct seccomp_data *sd)
+{
+#if defined __NR_uretprobe || defined __NR_uprobe
+#ifdef SECCOMP_ARCH_COMPAT
+ if (sd->arch == SECCOMP_ARCH_NATIVE)
+#endif
+ {
+#ifdef __NR_uretprobe
+ if (sd->nr == __NR_uretprobe)
+ return true;
+#endif
+#ifdef __NR_uprobe
+ if (sd->nr == __NR_uprobe)
+ return true;
+#endif
+ }
+#endif
+ return false;
+}
+
/**
* seccomp_is_const_allow - check if filter is constant allow with given data
* @fprog: The BPF programs
@@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
return false;
/* Our single exception to filtering. */
-#ifdef __NR_uretprobe
-#ifdef SECCOMP_ARCH_COMPAT
- if (sd->arch == SECCOMP_ARCH_NATIVE)
-#endif
- if (sd->nr == __NR_uretprobe)
- return true;
-#endif
+ if (seccomp_uprobe_exception(sd))
+ return true;
for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
@@ -1043,6 +1058,9 @@ static const int mode1_syscalls[] = {
#ifdef __NR_uretprobe
__NR_uretprobe,
#endif
+#ifdef __NR_uprobe
+ __NR_uprobe,
+#endif
-1, /* negative terminated */
};
diff --git a/kernel/smp.c b/kernel/smp.c
index 56f83aa58ec8..02f52291fae4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -884,16 +884,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
- * (atomically) until function has completed on other CPUs. If
- * %SCF_RUN_LOCAL is set, the function will also be run locally
- * if the local CPU is set in the @cpumask.
- *
- * If @wait is true, then returns once @func has returned.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler. Preemption
* must be disabled when calling this function.
+ *
+ * @func is not called on the local CPU even if @mask contains it. Consider
+ * using on_each_cpu_cond_mask() instead if this is not desirable.
*/
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 513b1945987c..77198911b8dd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -165,7 +165,11 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
/* First entry of a task into a BH disabled section? */
if (!current->softirq_disable_cnt) {
if (preemptible()) {
- local_lock(&softirq_ctrl.lock);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK))
+ local_lock(&softirq_ctrl.lock);
+ else
+ migrate_disable();
+
/* Required to meet the RCU bottomhalf requirements. */
rcu_read_lock();
} else {
@@ -177,17 +181,34 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
* Track the per CPU softirq disabled state. On RT this is per CPU
* state to allow preemption of bottom half disabled sections.
*/
- newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
- /*
- * Reflect the result in the task state to prevent recursion on the
- * local lock and to make softirq_count() & al work.
- */
- current->softirq_disable_cnt = newcnt;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ newcnt = this_cpu_add_return(softirq_ctrl.cnt, cnt);
+ /*
+ * Reflect the result in the task state to prevent recursion on the
+ * local lock and to make softirq_count() & al work.
+ */
+ current->softirq_disable_cnt = newcnt;
- if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
- raw_local_irq_save(flags);
- lockdep_softirqs_off(ip);
- raw_local_irq_restore(flags);
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
+ } else {
+ bool sirq_dis = false;
+
+ if (!current->softirq_disable_cnt)
+ sirq_dis = true;
+
+ this_cpu_add(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt += cnt;
+ WARN_ON_ONCE(current->softirq_disable_cnt < 0);
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_dis) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
}
}
EXPORT_SYMBOL(__local_bh_disable_ip);
@@ -195,23 +216,42 @@ EXPORT_SYMBOL(__local_bh_disable_ip);
static void __local_bh_enable(unsigned int cnt, bool unlock)
{
unsigned long flags;
+ bool sirq_en = false;
int newcnt;
- DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
- this_cpu_read(softirq_ctrl.cnt));
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
+ this_cpu_read(softirq_ctrl.cnt));
+ if (softirq_count() == cnt)
+ sirq_en = true;
+ } else {
+ if (current->softirq_disable_cnt == cnt)
+ sirq_en = true;
+ }
- if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_en) {
raw_local_irq_save(flags);
lockdep_softirqs_on(_RET_IP_);
raw_local_irq_restore(flags);
}
- newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
- current->softirq_disable_cnt = newcnt;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ newcnt = this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt = newcnt;
- if (!newcnt && unlock) {
- rcu_read_unlock();
- local_unlock(&softirq_ctrl.lock);
+ if (!newcnt && unlock) {
+ rcu_read_unlock();
+ local_unlock(&softirq_ctrl.lock);
+ }
+ } else {
+ current->softirq_disable_cnt -= cnt;
+ this_cpu_sub(softirq_ctrl.cnt, cnt);
+ if (unlock && !current->softirq_disable_cnt) {
+ migrate_enable();
+ rcu_read_unlock();
+ } else {
+ WARN_ON_ONCE(current->softirq_disable_cnt < 0);
+ }
}
}
@@ -228,7 +268,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
lock_map_release(&bh_lock_map);
local_irq_save(flags);
- curcnt = __this_cpu_read(softirq_ctrl.cnt);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK))
+ curcnt = this_cpu_read(softirq_ctrl.cnt);
+ else
+ curcnt = current->softirq_disable_cnt;
/*
* If this is not reenabling soft interrupts, no point in trying to
@@ -805,6 +848,58 @@ static bool tasklet_clear_sched(struct tasklet_struct *t)
return false;
}
+#ifdef CONFIG_PREEMPT_RT
+struct tasklet_sync_callback {
+ spinlock_t cb_lock;
+ atomic_t cb_waiters;
+};
+
+static DEFINE_PER_CPU(struct tasklet_sync_callback, tasklet_sync_callback) = {
+ .cb_lock = __SPIN_LOCK_UNLOCKED(tasklet_sync_callback.cb_lock),
+ .cb_waiters = ATOMIC_INIT(0),
+};
+
+static void tasklet_lock_callback(void)
+{
+ spin_lock(this_cpu_ptr(&tasklet_sync_callback.cb_lock));
+}
+
+static void tasklet_unlock_callback(void)
+{
+ spin_unlock(this_cpu_ptr(&tasklet_sync_callback.cb_lock));
+}
+
+static void tasklet_callback_cancel_wait_running(void)
+{
+ struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback);
+
+ atomic_inc(&sync_cb->cb_waiters);
+ spin_lock(&sync_cb->cb_lock);
+ atomic_dec(&sync_cb->cb_waiters);
+ spin_unlock(&sync_cb->cb_lock);
+}
+
+static void tasklet_callback_sync_wait_running(void)
+{
+ struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback);
+
+ if (atomic_read(&sync_cb->cb_waiters)) {
+ spin_unlock(&sync_cb->cb_lock);
+ spin_lock(&sync_cb->cb_lock);
+ }
+}
+
+#else /* !CONFIG_PREEMPT_RT: */
+
+static void tasklet_lock_callback(void) { }
+static void tasklet_unlock_callback(void) { }
+static void tasklet_callback_sync_wait_running(void) { }
+
+#ifdef CONFIG_SMP
+static void tasklet_callback_cancel_wait_running(void) { }
+#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
static void tasklet_action_common(struct tasklet_head *tl_head,
unsigned int softirq_nr)
{
@@ -816,6 +911,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
tl_head->tail = &tl_head->head;
local_irq_enable();
+ tasklet_lock_callback();
while (list) {
struct tasklet_struct *t = list;
@@ -835,6 +931,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
}
}
tasklet_unlock(t);
+ tasklet_callback_sync_wait_running();
continue;
}
tasklet_unlock(t);
@@ -847,6 +944,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
__raise_softirq_irqoff(softirq_nr);
local_irq_enable();
}
+ tasklet_unlock_callback();
}
static __latent_entropy void tasklet_action(void)
@@ -897,12 +995,9 @@ void tasklet_unlock_spin_wait(struct tasklet_struct *t)
/*
* Prevent a live lock when current preempted soft
* interrupt processing or prevents ksoftirqd from
- * running. If the tasklet runs on a different CPU
- * then this has no effect other than doing the BH
- * disable/enable dance for nothing.
+ * running.
*/
- local_bh_disable();
- local_bh_enable();
+ tasklet_callback_cancel_wait_running();
} else {
cpu_relax();
}
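
The per-CPU cb_lock/cb_waiters pair above replaces the BH disable/enable dance: tasklet_action_common() holds cb_lock around the callbacks, a cancelling task blocks on that lock instead, and cb_waiters lets the softirq side briefly drop the lock between tasklets so waiters make progress. A minimal driver-side sketch of the path that reaches tasklet_callback_cancel_wait_running() on PREEMPT_RT (the foo_* names are hypothetical):

	#include <linux/interrupt.h>
	#include <linux/spinlock.h>

	struct foo_dev {
		struct tasklet_struct	rx_tasklet;
		spinlock_t		lock;	/* taken from atomic context */
	};

	static void foo_stop_rx(struct foo_dev *foo)
	{
		unsigned long flags;

		spin_lock_irqsave(&foo->lock, flags);
		/*
		 * Wait for a callback running on another CPU to finish.
		 * On PREEMPT_RT this now blocks on the per-CPU cb_lock
		 * held by the softirq thread instead of flipping
		 * local_bh_disable()/local_bh_enable().
		 */
		tasklet_disable_in_atomic(&foo->rx_tasklet);
		spin_unlock_irqrestore(&foo->lock, flags);
	}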
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);
COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e6e9b85d4db5..f7d52d9543cc 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o
ifeq ($(CONFIG_SMP),y)
obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o
endif
-obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
+obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 577f0e6842d4..069d93bfb0c7 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -35,7 +35,7 @@
/**
* struct alarm_base - Alarm timer bases
- * @lock: Lock for syncrhonized access to the base
+ * @lock: Lock for synchronized access to the base
* @timerqueue: Timerqueue head managing the list of events
* @get_ktime: Function to read the time correlating to the base
* @get_timespec: Function to read the namespace time correlating to the base
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f3e831f62906..a59bc75ab7c5 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -633,7 +633,7 @@ void tick_offline_cpu(unsigned int cpu)
raw_spin_lock(&clockevents_lock);
tick_broadcast_offline(cpu);
- tick_shutdown(cpu);
+ tick_shutdown();
/*
* Unregister the clock event devices which were
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 0aef0e349e49..a1890a073196 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -144,7 +144,7 @@ static u64 suspend_start;
* Default for maximum permissible skew when cs->uncertainty_margin is
* not specified, and the lower bound even when cs->uncertainty_margin
* is specified. This is also the default that is used when registering
- * clocks with unspecifed cs->uncertainty_margin, so this macro is used
+ * clocks with unspecified cs->uncertainty_margin, so this macro is used
* even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
*/
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -407,9 +407,8 @@ void clocksource_verify_percpu(struct clocksource *cs)
if (!cpumask_empty(&cpus_behind))
pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
cpumask_pr_args(&cpus_behind), testcpu, cs->name);
- if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
- pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
- testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+ pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+ testcpu, cs_nsec_min, cs_nsec_max, cs->name);
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e8c479329282..88aa062b8a55 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -59,6 +59,7 @@
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
static void retrigger_next_event(void *arg);
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
/*
* The timer bases:
@@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
},
{
.index = HRTIMER_BASE_MONOTONIC_SOFT,
.clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME_SOFT,
.clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME_SOFT,
.clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI_SOFT,
.clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
},
},
.csd = CSD_INIT(retrigger_next_event, NULL)
@@ -208,7 +201,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_
/*
* The offline local CPU can't be the default target if the
* next remote target event is after this timer. Keep the
- * elected new base. An IPI will we issued to reprogram
+ * elected new base. An IPI will be issued to reprogram
* it as a last resort.
*/
if (!hrtimer_base_is_online(this_cpu_base))
@@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
remove_hrtimer(timer, base, true, force_local);
if (mode & HRTIMER_MODE_REL)
- tim = ktime_add_safe(tim, base->get_time());
+ tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
tim = hrtimer_update_lowres(timer, tim, mode);
@@ -1574,10 +1567,10 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
switch (clock_id) {
- case CLOCK_REALTIME:
- return HRTIMER_BASE_REALTIME;
case CLOCK_MONOTONIC:
return HRTIMER_BASE_MONOTONIC;
+ case CLOCK_REALTIME:
+ return HRTIMER_BASE_REALTIME;
case CLOCK_BOOTTIME:
return HRTIMER_BASE_BOOTTIME;
case CLOCK_TAI:
@@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
}
}
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
+{
+ switch (clock_id) {
+ case CLOCK_MONOTONIC:
+ return ktime_get();
+ case CLOCK_REALTIME:
+ return ktime_get_real();
+ case CLOCK_BOOTTIME:
+ return ktime_get_boottime();
+ case CLOCK_TAI:
+ return ktime_get_clocktai();
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return ktime_get();
+ }
+}
+
+ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
+{
+ return __hrtimer_cb_get_time(timer->base->clockid);
+}
+EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
+
static void __hrtimer_setup(struct hrtimer *timer,
enum hrtimer_restart (*function)(struct hrtimer *),
clockid_t clock_id, enum hrtimer_mode mode)
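
With the per-base get_time() pointer gone, callbacks obtain the current time through the clockid-based helper; existing users of hrtimer_cb_get_time() and hrtimer_forward_now() are unaffected. A minimal illustrative callback (the my_* names and the 10 ms period are assumptions, not from this patch):

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>
	#include <linux/printk.h>

	#define MY_PERIOD_NS	(10 * NSEC_PER_MSEC)	/* arbitrary example period */

	static struct hrtimer my_timer;

	static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
	{
		/* Resolved via the base's clockid instead of a function pointer. */
		ktime_t now = hrtimer_cb_get_time(t);

		pr_debug("expired at %lld ns\n", ktime_to_ns(now));
		hrtimer_forward_now(t, ns_to_ktime(MY_PERIOD_NS));
		return HRTIMER_RESTART;
	}

	static void my_timer_start(void)
	{
		hrtimer_setup(&my_timer, my_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		hrtimer_start(&my_timer, ns_to_ktime(MY_PERIOD_NS), HRTIMER_MODE_REL);
	}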
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 876d389b2e21..7c6110e964e7 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk)
struct hrtimer *tmr = &tsk->signal->real_timer;
if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) {
- hrtimer_forward(tmr, tmr->base->get_time(),
- tsk->signal->it_real_incr);
+ hrtimer_forward_now(tmr, tsk->signal->it_real_incr);
hrtimer_restart(tmr);
}
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 8b582174b1f9..aa3120104a51 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -299,8 +299,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
- timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
- timr->it_interval);
+ timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
hrtimer_restart(timer);
}
@@ -535,7 +534,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
goto out;
}
/*
- * After succesful copy out, the timer ID is visible to user space
+ * After successful copy out, the timer ID is visible to user space
* now but not yet valid because new_timer::signal low order bit is 1.
*
* Complete the initialization with the clock specific create
@@ -825,7 +824,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode);
if (!absolute)
- expires = ktime_add_safe(expires, timer->base->get_time());
+ expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
hrtimer_set_expires(timer, expires);
if (!sigev_none)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index cc15fe293719..cc1afec306b3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
-void __init
-sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
+void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
@@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
pr_debug("Registered %pS as sched_clock source\n", read);
}
+EXPORT_SYMBOL_GPL(sched_clock_register);
void __init generic_sched_clock_init(void)
{
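
Dropping __init and exporting the symbol means sched_clock_register() can now be called after early boot, e.g. from a modular clocksource driver. A hedged sketch under that assumption (the my_* names and the 32-bit MMIO counter are illustrative):

	#include <linux/io.h>
	#include <linux/sched_clock.h>

	static void __iomem *my_counter_base;	/* mapped elsewhere */

	static u64 my_sched_clock_read(void)
	{
		/* Free-running 32-bit up-counter. */
		return readl_relaxed(my_counter_base);
	}

	static void my_register_sched_clock(unsigned long rate_hz)
	{
		/* No longer restricted to __init callers. */
		sched_clock_register(my_sched_clock_read, 32, rate_hz);
	}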
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 9a3859443c04..7e33d3f2e889 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -411,24 +411,18 @@ int tick_cpu_dying(unsigned int dying_cpu)
}
/*
- * Shutdown an event device on a given cpu:
+ * Shutdown an event device on the outgoing CPU:
*
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * Called by the dying CPU during teardown, with clockevents_lock held
+ * and interrupts disabled.
*/
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown(void)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
if (dev) {
- /*
- * Prevent that the clock events layer tries to call
- * the set mode function!
- */
- clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index faac36de35b9..4e4f7bbe2a64 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
extern void tick_offline_cpu(unsigned int cpu);
-extern void tick_shutdown(unsigned int cpu);
+extern void tick_shutdown(void);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b03d0ada6469..488e47e96e93 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
-
- SEQ_printf(m, " .get_time: %ps\n", base->get_time);
#ifdef CONFIG_HIGH_RES_TIMERS
SEQ_printf(m, " .offset: %Lu nsecs\n",
(unsigned long long) ktime_to_ns(base->offset));
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6b79b3675c3..45320e27a16c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -222,7 +222,9 @@ struct worker_pool {
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
-
+#ifdef CONFIG_PREEMPT_RT
+ spinlock_t cb_lock; /* BH worker cancel lock */
+#endif
/*
* Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
@@ -2930,7 +2932,7 @@ static void idle_worker_timeout(struct timer_list *t)
raw_spin_unlock_irq(&pool->lock);
if (do_cull)
- queue_work(system_unbound_wq, &pool->idle_cull_work);
+ queue_work(system_dfl_wq, &pool->idle_cull_work);
}
/**
@@ -3078,6 +3080,31 @@ restart:
goto restart;
}
+#ifdef CONFIG_PREEMPT_RT
+static void worker_lock_callback(struct worker_pool *pool)
+{
+ spin_lock(&pool->cb_lock);
+}
+
+static void worker_unlock_callback(struct worker_pool *pool)
+{
+ spin_unlock(&pool->cb_lock);
+}
+
+static void workqueue_callback_cancel_wait_running(struct worker_pool *pool)
+{
+ spin_lock(&pool->cb_lock);
+ spin_unlock(&pool->cb_lock);
+}
+
+#else
+
+static void worker_lock_callback(struct worker_pool *pool) { }
+static void worker_unlock_callback(struct worker_pool *pool) { }
+static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { }
+
+#endif
+
/**
* manage_workers - manage worker pool
* @worker: self
@@ -3557,6 +3584,7 @@ static void bh_worker(struct worker *worker)
int nr_restarts = BH_WORKER_RESTARTS;
unsigned long end = jiffies + BH_WORKER_JIFFIES;
+ worker_lock_callback(pool);
raw_spin_lock_irq(&pool->lock);
worker_leave_idle(worker);
@@ -3585,6 +3613,7 @@ done:
worker_enter_idle(worker);
kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
+ worker_unlock_callback(pool);
}
/*
@@ -4222,17 +4251,17 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
(data & WORK_OFFQ_BH)) {
/*
* On RT, prevent a live lock when %current preempted
- * soft interrupt processing or prevents ksoftirqd from
- * running by keeping flipping BH. If the BH work item
- * runs on a different CPU then this has no effect other
- * than doing the BH disable/enable dance for nothing.
- * This is copied from
- * kernel/softirq.c::tasklet_unlock_spin_wait().
+ * soft interrupt processing by blocking on a lock which
+ * is owned by the thread invoking the callback.
*/
while (!try_wait_for_completion(&barr.done)) {
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- local_bh_disable();
- local_bh_enable();
+ struct worker_pool *pool;
+
+ guard(rcu)();
+ pool = get_work_pool(work);
+ if (pool)
+ workqueue_callback_cancel_wait_running(pool);
} else {
cpu_relax();
}
@@ -4782,6 +4811,9 @@ static int init_worker_pool(struct worker_pool *pool)
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
+#ifdef CONFIG_PREEMPT_RT
+ spin_lock_init(&pool->cb_lock);
+#endif
/* shouldn't fail above this point */
pool->attrs = alloc_workqueue_attrs();
@@ -6046,7 +6078,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
struct pool_workqueue *pwq;
bool ret;
- rcu_read_lock();
preempt_disable();
if (cpu == WORK_CPU_UNBOUND)
@@ -6056,7 +6087,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
ret = !list_empty(&pwq->inactive_works);
preempt_enable();
- rcu_read_unlock();
return ret;
}
@@ -7546,8 +7576,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
if (!thresh)
return;
- rcu_read_lock();
-
for_each_pool(pool, pi) {
unsigned long pool_ts, touched, ts;
@@ -7589,8 +7617,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
}
- rcu_read_unlock();
-
if (lockup_detected)
show_all_workqueues();
@@ -7642,7 +7668,7 @@ static int wq_watchdog_param_set_thresh(const char *val,
if (ret)
return ret;
- if (system_wq)
+ if (system_percpu_wq)
wq_watchdog_set_thresh(thresh);
else
wq_watchdog_thresh = thresh;
@@ -7802,22 +7828,22 @@ void __init workqueue_init_early(void)
ordered_wq_attrs[i] = attrs;
}
- system_wq = alloc_workqueue("events", 0, 0);
- system_percpu_wq = alloc_workqueue("events", 0, 0);
- system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
- system_long_wq = alloc_workqueue("events_long", 0, 0);
+ system_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_highpri_wq = alloc_workqueue("events_highpri",
+ WQ_HIGHPRI | WQ_PERCPU, 0);
+ system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
- WQ_FREEZABLE, 0);
+ WQ_FREEZABLE | WQ_PERCPU, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
- WQ_POWER_EFFICIENT, 0);
+ WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
- WQ_FREEZABLE | WQ_POWER_EFFICIENT,
- 0);
- system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
+ system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
- WQ_BH | WQ_HIGHPRI, 0);
+ WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0);
BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||