diff options
Diffstat (limited to 'kernel')
80 files changed, 3926 insertions, 2624 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..41751834e764 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o @@ -122,7 +122,7 @@ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/audit.c b/kernel/audit.c index 61b5744d0bb6..26a332ffb1b8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/security.h> +#include <linux/lsm_hooks.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> @@ -81,6 +82,13 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Number of modules that provide a security context. + List of lsms that provide a security context */ +static u32 audit_subj_secctx_cnt; +static u32 audit_obj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; +static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +203,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -279,6 +289,33 @@ static pid_t auditd_pid_vnr(void) } /** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. + */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } + if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { + for (i = 0 ; i < audit_obj_secctx_cnt; i++) + if (audit_obj_lsms[i] == lsmid) + return; + audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; + } +} + +/** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace * @@ -1113,7 +1150,6 @@ static int is_audit_feature_set(int i) return af.features & AUDIT_FEATURE_TO_MASK(i); } - static int audit_get_feature(struct sk_buff *skb) { u32 seq; @@ -1473,7 +1509,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, - &lsmctx); + &lsmctx, LSM_ID_UNDEF); if (err < 0) return err; } @@ -1776,10 +1812,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1792,9 +1831,14 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, if (!ab) return NULL; + skb_queue_head_init(&ab->skb_list); + ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1833,11 +1877,11 @@ unsigned int audit_serial(void) } static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial) + struct audit_stamp *stamp) { - if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_coarse_real_ts64(t); - *serial = audit_serial(); + if (!ctx || !auditsc_get_stamp(ctx, stamp)) { + ktime_get_coarse_real_ts64(&stamp->ctime); + stamp->serial = audit_serial(); } } @@ -1860,8 +1904,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct timespec64 t; - unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1916,12 +1958,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &t, &serial); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return ab; } @@ -2177,33 +2221,179 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. When the aux record in complete + * ab->skb has to be reset to point to the "main" record. + * This allows the audit_log_ functions to be ignorant of + * which kind of record it is logging to. It also avoids adding + * special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. + */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. + * + * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions. + */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); +int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) +{ + int i; + int rc; + int error = 0; + char *space = ""; + struct lsm_context ctx; + + if (audit_obj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return error; + } + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); + return 0; + } + audit_log_format(ab, " obj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_obj_secctx_cnt; i++) { + rc = security_lsmprop_to_secctx(prop, &ctx, + audit_obj_lsms[i]->id); + if (rc < 0) { + audit_log_format(ab, "%sobj_%s=?", space, + audit_obj_lsms[i]->name); + if (rc != -EINVAL) + audit_panic("error in audit_log_obj_ctx"); + error = rc; + } else { + audit_log_format(ab, "%sobj_%s=%s", space, + audit_obj_lsms[i]->name, ctx.context); + security_release_secctx(&ctx); + } + space = " "; + } + + audit_buffer_aux_end(ab); + return error; + +error_path: + audit_panic("error in audit_log_obj_ctx"); + return error; +} + void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { @@ -2411,6 +2601,28 @@ int audit_signal_info(int sig, struct task_struct *t) } /** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else { + audit_log_lost("rate limit exceeded"); + kfree_skb(skb); + } +} + +/** * audit_log_end - end one audit record * @ab: the audit_buffer * @@ -2422,25 +2634,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; - - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 2a24d01c5fb0..0f05933a173b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -99,6 +99,12 @@ struct audit_proctitle { char *value; /* the cmdline field */ }; +/* A timestamp/serial pair to identify an event */ +struct audit_stamp { + struct timespec64 ctime; /* time of syscall entry */ + unsigned int serial; /* serial number for record */ +}; + /* The per-task audit context. */ struct audit_context { int dummy; /* must be the first element */ @@ -108,10 +114,9 @@ struct audit_context { AUDIT_CTX_URING, /* in use by io_uring */ } context; enum audit_state state, current_state; - unsigned int serial; /* serial number for record */ + struct audit_stamp stamp; /* event identifier */ int major; /* syscall number */ int uring_op; /* uring operation */ - struct timespec64 ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ long return_code;/* syscall return code */ u64 prio; @@ -263,7 +268,7 @@ extern void audit_put_tty(struct tty_struct *tty); extern unsigned int audit_serial(void); #ifdef CONFIG_AUDITSYSCALL extern int auditsc_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial); + struct audit_stamp *stamp); extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); @@ -304,7 +309,7 @@ extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); #else /* CONFIG_AUDITSYSCALL */ -#define auditsc_get_stamp(c, t, s) 0 +#define auditsc_get_stamp(c, s) 0 #define audit_put_watch(w) do { } while (0) #define audit_get_watch(w) do { } while (0) #define audit_to_watch(k, p, l, o) (-EINVAL) diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index c565fbf66ac8..b92805b317a2 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa struct audit_fsnotify_mark *audit_mark; struct path path; struct dentry *dentry; - struct inode *inode; int ret; if (pathname[0] != '/' || pathname[len-1] == '/') return ERR_PTR(-EINVAL); - dentry = kern_path_locked(pathname, &path); + dentry = kern_path_parent(pathname, &path); if (IS_ERR(dentry)) return ERR_CAST(dentry); /* returning an error */ - inode = path.dentry->d_inode; - inode_unlock(inode); + if (d_really_is_negative(dentry)) { + audit_mark = ERR_PTR(-ENOENT); + goto out; + } audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { @@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); + ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0); if (ret < 0) { audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0eae2a3c895..1605df0a171e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -93,8 +93,10 @@ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init; static struct audit_tree *alloc_tree(const char *s) { struct audit_tree *tree; + size_t sz; - tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL); + sz = strlen(s) + 1; + tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL); if (tree) { refcount_set(&tree->count, 1); tree->goner = 0; @@ -103,7 +105,7 @@ static struct audit_tree *alloc_tree(const char *s) INIT_LIST_HEAD(&tree->list); INIT_LIST_HEAD(&tree->same_root); tree->root = NULL; - strcpy(tree->pathname, s); + strscpy(tree->pathname, s, sz); } return tree; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0ebbbe37a60f..a700e3c8925f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { struct dentry *d; - d = kern_path_locked_negative(watch->path, parent); + d = kern_path_parent(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); @@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) watch->ino = d_backing_inode(d)->i_ino; } - inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f7708fe2c457..c401082d9b25 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1440,7 +1440,7 @@ static int update_lsm_rule(struct audit_krule *r) } /* This function will re-initialize the lsm_rule field of all applicable rules. - * It will traverse the filter lists serarching for rules that contain LSM + * It will traverse the filter lists searching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index eb98cd6fe91f..d1966144bdfe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -994,10 +994,10 @@ static void audit_reset_context(struct audit_context *ctx) */ ctx->current_state = ctx->state; - ctx->serial = 0; + ctx->stamp.serial = 0; + ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; ctx->major = 0; ctx->uring_op = 0; - ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; memset(ctx->argv, 0, sizeof(ctx->argv)); ctx->return_code = 0; ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0); @@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx) < 0) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) + rc = 1; + audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); @@ -1392,15 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - struct lsm_context lsmctx; - - if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx) < 0) { + if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", lsmctx.context); - security_release_secctx(&lsmctx); - } } if (context->ipc.has_perm) { audit_log_end(ab); @@ -1557,17 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (lsmprop_is_set(&n->oprop)) { - struct lsm_context ctx; - - if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(&n->oprop) && + audit_log_obj_ctx(ab, &n->oprop)) + *call_panic = 2; /* log the audit_names record type */ switch (n->type) { @@ -1785,8 +1763,9 @@ static void audit_log_exit(void) audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, - &context->target_ref, context->target_comm)) - call_panic = 1; + &context->target_ref, + context->target_comm)) + call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); @@ -1917,7 +1896,7 @@ void __audit_uring_entry(u8 op) ctx->context = AUDIT_CTX_URING; ctx->current_state = ctx->state; - ktime_get_coarse_real_ts64(&ctx->ctime); + ktime_get_coarse_real_ts64(&ctx->stamp.ctime); } /** @@ -2039,7 +2018,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->argv[3] = a4; context->context = AUDIT_CTX_SYSCALL; context->current_state = state; - ktime_get_coarse_real_ts64(&context->ctime); + ktime_get_coarse_real_ts64(&context->stamp.ctime); } /** @@ -2508,21 +2487,17 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task - * @t: timespec64 to store time recorded in the audit_context - * @serial: serial value that is recorded in the audit_context + * @stamp: timestamp to record * * Also sets the context as auditable. */ -int auditsc_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial) +int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp) { if (ctx->context == AUDIT_CTX_UNUSED) return 0; - if (!ctx->serial) - ctx->serial = audit_serial(); - t->tv_sec = ctx->ctime.tv_sec; - t->tv_nsec = ctx->ctime.tv_nsec; - *serial = ctx->serial; + if (!ctx->stamp.serial) + ctx->stamp.serial = audit_serial(); + *stamp = ctx->stamp; if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 269c04a24664..f6cf8c2af5f7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d..e4568d44e827 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2366,8 +2366,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! + * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; @@ -2468,8 +2467,9 @@ out: return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2478,15 +2478,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2505,7 +2506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = fp->jit_requested; + bool jit_needed = false; if (fp->bpf_func) goto finalize; @@ -2514,7 +2515,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -3024,7 +3026,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { - .func = NULL, + /* func is unused for tail_call, we set it to pass the + * get_helper_proto check + */ + .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b2b7b8ec2c2a..c46360b27871 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { @@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } - xdp_clear_return_frame_no_direct(); stats->pass += nframes; return nframes; @@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); + xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 94854cd9c4cc..83c4d9943084 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, siv_len = siv ? __bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); - if (!src_len || !dst_len) + if (!src_len || !dst_len || src_len > dst_len) return -EINVAL; if (siv_len != ctx->siv_len) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68..8af62cb243d9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1274,8 +1274,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until + * kmalloc_nolock() is available, avoid locking issues by using + * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). + */ + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -3664,10 +3667,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { - for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) { + for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2__ign + j, char, err_out); if (c2 == '\0') return i; + /* + * We allow reading an extra byte from s2 (note the + * `i + j <= len` above) to cover the case when s2 is + * a suffix of the first len chars of s1. + */ + if (i + j == len) + break; __get_kernel_nofault(&c1, s1__ign + j, char, err_out); if (c1 == '\0') return -ENOENT; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..f90bdcc0a047 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } @@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode) const struct super_operations bpf_super_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 5ab354d55d82..a00561b1d3e5 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -471,7 +471,7 @@ queue: * any MCS node. This is not the most elegant solution, but is * simple enough. */ - if (unlikely(idx >= _Q_MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { lockevent_inc(lock_no_node); RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); while (!queued_spin_trylock(lock)) { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa..ec3a57a5fba1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false); if (unlikely(!trace)) @@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..8340cecd1b35 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8547,6 +8547,10 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); + return -EOPNOTSUPP; + } meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; @@ -11354,7 +11358,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ? 0 : -EINVAL; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -23855,6 +23859,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP +BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b14e61c64a34..22051b4f1ccb 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(bool lock_threadgroup); -void cgroup_attach_unlock(bool lock_threadgroup); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task, bool locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2a4a387f867a..a9e029b570c8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -10,6 +10,7 @@ #include <linux/sched/task.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/vmalloc.h> #include <linux/delayacct.h> #include <linux/pid_namespace.h> @@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); for_each_root(root) { struct cgroup *from_cgrp; @@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; @@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return ret; } @@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc) if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, ctx->release_agent); + strscpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } @@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void) * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); + WQ_PERCPU, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..da4eaee52178 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -59,6 +59,7 @@ #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> +#include <linux/nstree.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -124,10 +125,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. */ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -216,13 +240,22 @@ static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); +/* + * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, + * read protected by either. + * + * Can only be turned on, but not turned off. + */ +bool cgroup_enable_per_threadgroup_rwsem __read_mostly; + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, - .ns.inum = PROC_CGROUP_INIT_INO, + .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; @@ -1302,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; - /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + /* + * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition. + * favordynmods can flip while task is between + * cgroup_threadgroup_change_begin() and end(), so down_write global + * cgroup_threadgroup_rwsem to synchronize them. + * + * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding + * cgroup_threadgroup_rwsem doesn't exlude tasks between + * cgroup_thread_group_change_begin() and end() and thus it's unsafe to + * turn off. As the scenario is unlikely, simply disallow disabling once + * enabled and print out a warning. + */ + percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { + cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { + if (cgroup_enable_per_threadgroup_rwsem) + pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } + percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) @@ -2459,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether acquire and acquire which rwsem + * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() @@ -2479,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. + * + * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead + * on dynamic cgroup modifications. see the comment above + * CGRP_ROOT_FAVOR_DYNMODS definition. + * + * tsk is not NULL only when writing to cgroup.procs. */ -void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { cpus_read_lock(); - if (lock_threadgroup) + + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + down_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether release and release which rwsem + * @tsk: thread group to lock */ -void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { - if (lock_threadgroup) + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + up_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } + cpus_read_unlock(); } @@ -2944,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* look up all src csets */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ @@ -2968,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; @@ -2976,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - /* - * If we migrate a single thread, we don't care about threadgroup - * stability. If the thread is `current`, it won't exit(2) under our - * hands or change PID through exec(2). We exclude - * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write - * callers by cgroup_mutex. - * Therefore, we can skip the global lock. - */ - lockdep_assert_held(&cgroup_mutex); - *threadgroup_locked = pid || threadgroup; - cgroup_attach_lock(*threadgroup_locked); - +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } } else { tsk = current; @@ -3010,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } - get_task_struct(tsk); - goto out_unlock_rcu; + rcu_read_unlock(); + + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers + * by cgroup_mutex. Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); + + if (pid || threadgroup) { + if (cgroup_enable_per_threadgroup_rwsem) + *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; + else + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + } else { + *lock_mode = CGRP_ATTACH_LOCK_NONE; + } + + cgroup_attach_lock(*lock_mode, tsk); + + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * A race with de_thread from another thread's exec() + * may strip us of our leadership. If this happens, + * throw this task away and try again. + */ + cgroup_attach_unlock(*lock_mode, tsk); + put_task_struct(tsk); + goto retry_find_task; + } + } + + return tsk; -out_unlock_threadgroup: - cgroup_attach_unlock(*threadgroup_locked); - *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { - struct cgroup_subsys *ss; - int ssid; + cgroup_attach_unlock(lock_mode, task); /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - - cgroup_attach_unlock(threadgroup_locked); - - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3088,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; @@ -3119,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - cgroup_attach_lock(has_tasks); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3140,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(has_tasks); + cgroup_attach_unlock(lock_mode, NULL); return ret; } @@ -3763,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. */ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + do_div(freeze_time, NSEC_PER_USEC); + seq_printf(seq, "frozen_usec %llu\n", freeze_time); + + return 0; +} + #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem @@ -4159,6 +4282,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -5241,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool threadgroup_locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5273,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, threadgroup_locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -5355,6 +5479,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, + { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, @@ -5558,7 +5687,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5567,7 +5696,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5701,7 +5830,7 @@ err_list_del: list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5763,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_init(&cgrp->freezer.freeze_seq); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5771,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } @@ -5939,7 +6070,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6312,6 +6443,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cpuset_fs_type)); #endif + ns_tree_add(&init_cgroup_ns); return 0; } @@ -6325,8 +6457,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 383963e28ac6..337608f408ce 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -38,7 +38,6 @@ enum prs_errcode { /* bits in struct cpuset flags field */ typedef enum { - CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) @@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cpuset_common_seq_show(struct seq_file *sf, void *v); +void cpuset_full_lock(void); +void cpuset_full_unlock(void); /* * cpuset-v1.c diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index b69a7db67090..12e76774c75b 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } @@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 27adb04df675..52468d2c178a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -40,6 +40,7 @@ #include <linux/sched/isolation.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/task_work.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -131,11 +132,6 @@ static bool force_sd_rebuild; #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 -static inline bool is_prs_invalid(int prs_state) -{ - return prs_state < 0; -} - /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions. @@ -159,16 +155,21 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -static inline int is_partition_valid(const struct cpuset *cs) +static inline bool is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } -static inline int is_partition_invalid(const struct cpuset *cs) +static inline bool is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } +static inline bool cs_is_member(const struct cpuset *cs) +{ + return cs->partition_root_state == PRS_MEMBER; +} + /* * Callers should hold callback_lock to modify partition_root_state. */ @@ -207,7 +208,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ static struct cpuset top_cpuset = { - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, @@ -250,6 +251,12 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); +/** + * cpuset_lock - Acquire the global cpuset mutex + * + * This locks the global cpuset mutex to prevent modifications to cpuset + * hierarchy and configurations. This helper is not enough to make modification. + */ void cpuset_lock(void) { mutex_lock(&cpuset_mutex); @@ -260,6 +267,24 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +/** + * cpuset_full_lock - Acquire full protection for cpuset modification + * + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex + * to safely modify cpuset data. + */ +void cpuset_full_lock(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +void cpuset_full_unlock(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) @@ -411,94 +436,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) } /** - * alloc_cpumasks - allocate three cpumasks for cpuset - * @cs: the cpuset that have cpumasks to be allocated. - * @tmp: the tmpmasks structure pointer + * alloc_cpumasks - Allocate an array of cpumask variables + * @pmasks: Pointer to array of cpumask_var_t pointers + * @size: Number of cpumasks to allocate * Return: 0 if successful, -ENOMEM otherwise. * - * Only one of the two input arguments should be non-NULL. + * Allocates @size cpumasks and initializes them to empty. Returns 0 on + * success, -ENOMEM on allocation failure. On failure, any previously + * allocated cpumasks are freed. */ -static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { - cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; + int i; - if (cs) { - pmask1 = &cs->cpus_allowed; - pmask2 = &cs->effective_cpus; - pmask3 = &cs->effective_xcpus; - pmask4 = &cs->exclusive_cpus; - } else { - pmask1 = &tmp->new_cpus; - pmask2 = &tmp->addmask; - pmask3 = &tmp->delmask; - pmask4 = NULL; + for (i = 0; i < size; i++) { + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { + while (--i >= 0) + free_cpumask_var(*pmasks[i]); + return -ENOMEM; + } } - - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) - goto free_one; - - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) - goto free_two; - - if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) - goto free_three; - - return 0; +} + +/** + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. + * @tmp: Pointer to tmpmasks structure to populate + * Return: 0 on success, -ENOMEM on allocation failure + */ +static inline int alloc_tmpmasks(struct tmpmasks *tmp) +{ + /* + * Array of pointers to the three cpumask_var_t fields in tmpmasks. + * Note: Array size must match actual number of masks (3) + */ + cpumask_var_t *pmask[3] = { + &tmp->new_cpus, + &tmp->addmask, + &tmp->delmask + }; -free_three: - free_cpumask_var(*pmask3); -free_two: - free_cpumask_var(*pmask2); -free_one: - free_cpumask_var(*pmask1); - return -ENOMEM; + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** - * free_cpumasks - free cpumasks in a tmpmasks structure - * @cs: the cpuset that have cpumasks to be free. + * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ -static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline void free_tmpmasks(struct tmpmasks *tmp) { - if (cs) { - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->effective_xcpus); - free_cpumask_var(cs->exclusive_cpus); - } - if (tmp) { - free_cpumask_var(tmp->new_cpus); - free_cpumask_var(tmp->addmask); - free_cpumask_var(tmp->delmask); - } + if (!tmp) + return; + + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); } /** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) + * + * Creates a new cpuset by either: + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) + * + * Return: Pointer to newly allocated cpuset on success, NULL on failure */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + /* Allocate base structure */ + trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) : + kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; - if (alloc_cpumasks(trial, NULL)) { + /* Setup cpumask pointer array */ + cpumask_var_t *pmask[4] = { + &trial->cpus_allowed, + &trial->effective_cpus, + &trial->effective_xcpus, + &trial->exclusive_cpus + }; + + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); - cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); - cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + /* Copy masks if duplicating */ + if (cs) { + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + } + return trial; } @@ -508,7 +543,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static inline void free_cpuset(struct cpuset *cs) { - free_cpumasks(cs, NULL); + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); kfree(cs); } @@ -540,6 +578,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) return true; } +/** + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * Conflict detection rules: + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive + * 2. exclusive_cpus masks cannot intersect between cpusets + * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + */ +static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + /* If either cpuset is exclusive, check if they are mutually exclusive */ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return !cpusets_are_exclusive(cs1, cs2); + + /* Exclusive_cpus cannot intersect */ + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + return true; + + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ + if (!cpumask_empty(cs1->cpus_allowed) && + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) + return true; + + if (!cpumask_empty(cs2->cpus_allowed) && + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + return true; + + return false; +} + +static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); + return false; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -621,38 +700,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - bool txset, cxset; /* Are exclusive_cpus set? */ - if (c == cur) continue; - - txset = !cpumask_empty(trial->exclusive_cpus); - cxset = !cpumask_empty(c->exclusive_cpus); - if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || - (txset && cxset)) { - if (!cpusets_are_exclusive(trial, c)) - goto out; - } else if (txset || cxset) { - struct cpumask *xcpus, *acpus; - - /* - * When just one of the exclusive_cpus's is set, - * cpus_allowed of the other cpuset, if set, cannot be - * a subset of it or none of those CPUs will be - * available if these exclusive CPUs are activated. - */ - if (txset) { - xcpus = trial->exclusive_cpus; - acpus = c->cpus_allowed; - } else { - xcpus = c->exclusive_cpus; - acpus = trial->cpus_allowed; - } - if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) - goto out; - } - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) + if (cpus_excl_conflict(trial, c)) + goto out; + if (mems_excl_conflict(trial, c)) goto out; } @@ -1363,38 +1415,25 @@ bool cpuset_cpu_is_isolated(int cpu) } EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); -/* - * compute_effective_exclusive_cpumask - compute effective exclusive CPUs - * @cs: cpuset - * @xcpus: effective exclusive CPUs value to be set - * @real_cs: the real cpuset (can be NULL) - * Return: 0 if there is no sibling conflict, > 0 otherwise +/** + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets + * @parent: Parent cpuset containing all siblings + * @cs: Current cpuset (will be skipped) + * @excpus: exclusive effective CPU mask to modify * - * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to - * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus - * as well. The provision of real_cs means that a cpumask is being changed and - * the given cs is a trial one. + * This function ensures the given @excpus mask doesn't include any CPUs that + * are exclusively allocated to sibling cpusets. It walks through all siblings + * of @cs under @parent and removes their exclusive CPUs from @excpus. */ -static int compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus, - struct cpuset *real_cs) +static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, + struct cpumask *excpus) { struct cgroup_subsys_state *css; - struct cpuset *parent = parent_cs(cs); struct cpuset *sibling; int retval = 0; - if (!xcpus) - xcpus = cs->effective_xcpus; - - cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); - - if (!real_cs) { - if (!cpumask_empty(cs->exclusive_cpus)) - return 0; - } else { - cs = real_cs; - } + if (cpumask_empty(excpus)) + return retval; /* * Exclude exclusive CPUs from siblings @@ -1404,20 +1443,66 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs, if (sibling == cs) continue; - if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { - cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { + cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); retval++; continue; } - if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { - cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + if (cpumask_intersects(excpus, sibling->effective_xcpus)) { + cpumask_andnot(excpus, excpus, sibling->effective_xcpus); retval++; } } rcu_read_unlock(); + return retval; } +/* + * compute_excpus - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: 0 if there is no sibling conflict, > 0 otherwise + * + * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets + * and exclude their exclusive_cpus or effective_xcpus as well. + */ +static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) +{ + struct cpuset *parent = parent_cs(cs); + + cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); + + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +/* + * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset + * @trialcs: The trial cpuset containing the proposed new configuration + * @cs: The original cpuset that the trial configuration is based on + * Return: 0 if successful with no sibling conflict, >0 if a conflict is found + * + * Computes the effective_xcpus for a trial configuration. @cs is provided to represent + * the real cs. + */ +static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(trialcs); + struct cpumask *excpus = trialcs->effective_xcpus; + + /* trialcs is member, cpuset.cpus has no impact to excpus */ + if (cs_is_member(cs)) + cpumask_and(excpus, trialcs->exclusive_cpus, + parent->effective_xcpus); + else + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + static inline bool is_remote_partition(struct cpuset *cs) { return !list_empty(&cs->remote_sibling); @@ -1459,7 +1544,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * Note that creating a remote partition with any local partition root * above it or remote partition root underneath it is not allowed. */ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); + compute_excpus(cs, tmp->new_cpus); WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) @@ -1508,7 +1593,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) cs->partition_root_state = PRS_MEMBER; /* effective_xcpus may need to be changed */ - compute_effective_exclusive_cpumask(cs, NULL, NULL); + compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); @@ -1677,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) + if (is_partition_invalid(cs)) return 0; /* @@ -1709,13 +1794,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* - * Need to call compute_effective_exclusive_cpumask() in case + * Need to call compute_excpus() in case * exclusive_cpus not set. Sibling conflict should only happen * if exclusive_cpus isn't set. */ xcpus = tmp->delmask; - if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + if (compute_excpus(cs, xcpus)) WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; /* * Enabling partition root is not allowed if its @@ -1727,11 +1813,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; - /* - * A parent can be left with no CPU as long as there is no - * task directly associated with the parent partition. - */ - if (nocpu) + if (tasks_nocpu_error(parent, cs, xcpus)) return PERR_NOCPUS; /* @@ -1748,7 +1830,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, deleting = true; subparts_delta++; - new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus @@ -1788,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * For invalid partition: * delmask = newmask & parent->effective_xcpus */ - if (is_prs_invalid(old_prs)) { + if (is_partition_invalid(cs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); @@ -1837,7 +1918,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. */ - WARN_ON_ONCE(!is_partition_valid(parent)); if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) @@ -1996,7 +2076,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); + compute_excpus(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -2075,7 +2155,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * its value is being processed. */ if (remote && (cp != cs)) { - compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + compute_excpus(cp, tmp->new_cpus); if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2177,7 +2257,7 @@ get_css: cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_effective_exclusive_cpumask(cp, NULL, NULL); + compute_excpus(cp, cp->effective_xcpus); /* * Make sure effective_xcpus is properly set for a valid @@ -2284,82 +2364,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, rcu_read_unlock(); } -/** - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it - * @cs: the cpuset to consider - * @trialcs: trial cpuset - * @buf: buffer of cpu numbers written to this cpuset - */ -static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) +static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) { int retval; - struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; - bool force = false; - int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + retval = cpulist_parse(buf, out_mask); + if (retval < 0) + return retval; + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) + return -EINVAL; - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. - */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - if (cpumask_empty(trialcs->exclusive_cpus)) - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; + return 0; +} - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; +/** + * validate_partition - Validate a cpuset partition configuration + * @cs: The cpuset to validate + * @trialcs: The trial cpuset containing proposed configuration changes + * + * If any validation check fails, the appropriate error code is set in the + * cpuset's prs_err field. + * + * Return: PRS error code (0 if valid, non-zero error code if invalid) + */ +static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) +{ + struct cpuset *parent = parent_cs(cs); - /* - * When exclusive_cpus isn't explicitly set, it is constrained - * by cpus_allowed and parent's effective_xcpus. Otherwise, - * trialcs->effective_xcpus is used as a temporary cpumask - * for checking validity of the partition root. - */ - trialcs->partition_root_state = PRS_MEMBER; - if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL, cs); - } + if (cs_is_member(trialcs)) + return PERR_NONE; - /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) - return 0; + if (cpumask_empty(trialcs->effective_xcpus)) + return PERR_INVCPUS; - if (alloc_cpumasks(NULL, &tmp)) - return -ENOMEM; + if (prstate_housekeeping_conflict(trialcs->partition_root_state, + trialcs->effective_xcpus)) + return PERR_HKEEPING; - if (old_prs) { - if (is_partition_valid(cs) && - cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } - } + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) + return PERR_NOCPUS; - /* - * Check all the descendants in update_cpumasks_hier() if - * effective_xcpus is to be changed. - */ - force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + return PERR_NONE; +} + +static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + int retval; + struct cpuset *parent = parent_cs(cs); retval = validate_change(cs, trialcs); @@ -2374,7 +2426,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * partition. However, any conflicting sibling partitions * have to be marked as invalid too. */ - invalidate = true; + trialcs->prs_err = PERR_NOTEXCL; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { struct cpumask *xcpus = user_xcpus(trialcs); @@ -2382,36 +2434,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); rcu_read_lock(); } } rcu_read_unlock(); retval = 0; } + return retval; +} - if (retval < 0) - goto out_free; +/** + * partition_cpus_change - Handle partition state changes due to CPU mask updates + * @cs: The target cpuset being modified + * @trialcs: The trial cpuset containing proposed configuration changes + * @tmp: Temporary masks for intermediate calculations + * + * This function handles partition state transitions triggered by CPU mask changes. + * CPU modifications may cause a partition to be disabled or require state updates. + */ +static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + enum prs_errcode prs_err; - if (is_partition_valid(cs) || - (is_partition_invalid(cs) && !invalidate)) { - struct cpumask *xcpus = trialcs->effective_xcpus; + if (cs_is_member(cs)) + return; - if (cpumask_empty(xcpus) && is_partition_invalid(cs)) - xcpus = trialcs->cpus_allowed; + prs_err = validate_partition(cs, trialcs); + if (prs_err) + trialcs->prs_err = cs->prs_err = prs_err; - /* - * Call remote_cpus_update() to handle valid remote partition - */ - if (is_remote_partition(cs)) - remote_cpus_update(cs, NULL, xcpus, &tmp); - else if (invalidate) + if (is_remote_partition(cs)) { + if (trialcs->prs_err) + remote_partition_disable(cs, tmp); + else + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, tmp); + } else { + if (trialcs->prs_err) update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); + NULL, tmp); else update_parent_effective_cpumask(cs, partcmd_update, - xcpus, &tmp); + trialcs->effective_xcpus, tmp); } +} + +/** + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; + + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + return 0; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + compute_trialcs_excpus(trialcs, cs); + trialcs->prs_err = PERR_NONE; + + retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + if (retval < 0) + goto out_free; + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + + partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); @@ -2427,7 +2535,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); out_free: - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return retval; } @@ -2444,33 +2552,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; bool force = false; int old_prs = cs->partition_root_state; - if (!*buf) { - cpumask_clear(trialcs->exclusive_cpus); - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->exclusive_cpus); - if (retval < 0) - return retval; - } + retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (*buf) { - trialcs->partition_root_state = PRS_MEMBER; - /* - * Reject the change if there is exclusive CPUs conflict with - * the siblings. - */ - if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) - return -EINVAL; - } + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. + */ + if (compute_trialcs_excpus(trialcs, cs)) + return -EINVAL; /* * Check all the descendants in update_cpumasks_hier() if @@ -2482,35 +2580,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval) return retval; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_tmpmasks(&tmp)) return -ENOMEM; - if (old_prs) { - if (cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } + trialcs->prs_err = PERR_NONE; + partition_cpus_change(cs, trialcs, &tmp); - if (is_remote_partition(cs)) { - if (invalidate) - remote_partition_disable(cs, &tmp); - else - remote_cpus_update(cs, trialcs->exclusive_cpus, - trialcs->effective_xcpus, &tmp); - } else if (invalidate) { - update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); - } else { - update_parent_effective_cpumask(cs, partcmd_update, - trialcs->effective_xcpus, &tmp); - } - } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); @@ -2530,7 +2605,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return 0; } @@ -2582,9 +2657,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -static void cpuset_post_attach(void) +static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); + kfree(head); +} + +static void schedule_flush_migrate_mm(void) +{ + struct callback_head *flush_cb; + + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + if (!flush_cb) + return; + + init_task_work(flush_cb, flush_migrate_mm_task_workfn); + + if (task_work_add(current, flush_cb, TWA_RESUME)) + kfree(flush_cb); } /* @@ -2750,32 +2840,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, int retval; /* - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. + * The validate_change() call ensures that cpusets with tasks have memory. */ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; + retval = nodelist_parse(buf, trialcs->mems_allowed); + if (retval < 0) + goto done; - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } + if (!nodes_subset(trialcs->mems_allowed, + top_cpuset.mems_allowed)) { + retval = -EINVAL; + goto done; } if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { @@ -2826,7 +2901,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int spread_flag_changed; int err; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; @@ -2884,10 +2959,10 @@ static int update_prstate(struct cpuset *cs, int new_prs) /* * Treat a previously invalid partition root as if it is a "member". */ - if (new_prs && is_prs_invalid(old_prs)) + if (new_prs && is_partition_invalid(cs)) old_prs = PRS_MEMBER; - if (alloc_cpumasks(NULL, &tmpmask)) + if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; err = update_partition_exclusive_flag(cs, new_prs); @@ -2983,7 +3058,7 @@ out: notify_partition_change(cs, old_prs); if (force_sd_rebuild) rebuild_sched_domains_locked(); - free_cpumasks(NULL, &tmpmask); + free_tmpmasks(&tmpmask); return 0; } @@ -3141,6 +3216,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; + bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); @@ -3191,15 +3267,18 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) + if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - else + queue_task_work = true; + } else mmput(mm); } } out: + if (queue_task_work) + schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { @@ -3223,13 +3302,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, struct cpuset *trialcs; int retval = -ENODEV; + /* root is read-only */ + if (cs == &top_cpuset) + return -EACCES; + buf = strstrip(buf); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; @@ -3254,9 +3336,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, if (force_sd_rebuild) rebuild_sched_domains_locked(); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - flush_workqueue(cpuset_migrate_mm_wq); + cpuset_full_unlock(); + if (of_cft(of)->private == FILE_MEMLIST) + schedule_flush_migrate_mm(); return retval ?: nbytes; } @@ -3358,12 +3440,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return retval ?: nbytes; } @@ -3462,15 +3542,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) return &top_cpuset.css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); - if (alloc_cpumasks(cs, NULL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } - __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; @@ -3493,10 +3568,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - - set_bit(CS_ONLINE, &cs->flags); + cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3548,8 +3620,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return 0; } @@ -3564,17 +3635,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); } /* @@ -3586,16 +3652,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - + cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -3724,7 +3785,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .post_attach = cpuset_post_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, @@ -3928,7 +3988,7 @@ static void cpuset_handle_hotplug(void) bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; lockdep_assert_cpus_held(); @@ -4008,7 +4068,7 @@ static void cpuset_handle_hotplug(void) if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); - free_cpumasks(NULL, ptmp); + free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) @@ -4073,7 +4133,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = task_cs(tsk); if (cs != &top_cpuset) @@ -4095,7 +4154,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) cpumask_copy(pmask, possible_mask); } - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } @@ -4168,9 +4226,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return mask; @@ -4265,10 +4321,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return allowed; diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 80aa3f027ac3..81ea38dd6f9d 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) return -ENODEV; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); @@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, css, css->id); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); cgroup_kn_unlock(of->kn); return 0; @@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) return -ENOMEM; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name_buf); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index bf1690a167dd..6c18854bff34 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. */ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) /* * Do change actual state: freeze or unfreeze. */ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..fdbe57578e68 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -5,7 +5,7 @@ #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> - +#include <linux/nstree.h> /* cgroup namespaces */ @@ -21,33 +21,32 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts) static struct cgroup_namespace *alloc_cgroup_ns(void) { - struct cgroup_namespace *new_ns; + struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); + ret = ns_common_init(new_ns); + if (ret) return ERR_PTR(ret); - } - refcount_set(&new_ns->ns.count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; + ns_tree_add(new_ns); + return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) { + ns_tree_remove(ns); put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { @@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 64caaf997fc0..7c3924614e01 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -93,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y # Provides some protections against SYN flooding. CONFIG_SYN_COOKIES=y -# Enable Kernel Control Flow Integrity (currently Clang only). -CONFIG_CFI_CLANG=y +# Enable Kernel Control Flow Integrity. +CONFIG_CFI=y # CONFIG_CFI_PERMISSIVE is not set # Attack surface reduction: do not autoload TTY line disciplines. diff --git a/kernel/cred.c b/kernel/cred.c index 9676965c0981..dbf6b687dc5c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index e43c6de2bce4..b82399437db0 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -39,6 +39,7 @@ enum { dma_debug_sg, dma_debug_coherent, dma_debug_resource, + dma_debug_noncoherent, }; enum map_err_types { @@ -141,6 +142,7 @@ static const char *type2name[] = { [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", + [dma_debug_noncoherent] = "noncoherent", }; static const char *dir2name[] = { @@ -993,7 +995,8 @@ static void check_unmap(struct dma_debug_entry *ref) "[mapped as %s] [unmapped as %s]\n", ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); - } else if (entry->type == dma_debug_coherent && + } else if ((entry->type == dma_debug_coherent || + entry->type == dma_debug_noncoherent) && ref->paddr != entry->paddr) { err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " @@ -1581,6 +1584,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } } +void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_noncoherent; + entry->dev = dev; + entry->paddr = page_to_phys(page); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + + add_dma_entry(entry, attrs); +} + +void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_noncoherent, + .dev = dev, + .paddr = page_to_phys(page), + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} + static int __init dma_debug_driver_setup(char *str) { int i; diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae..48757ca13f31 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev, extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs); +extern void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, @@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, int nelems, int direction) { } + +static inline void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ +} + +static inline void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 107e4a4d251d..56de28a3b179 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -712,7 +712,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } @@ -738,7 +738,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); - debug_dma_unmap_page(dev, dma_handle, size, dir); + debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 408d28b5179d..f62e1d1b2063 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -143,6 +143,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } +/** + * arch_irqentry_exit_need_resched - Architecture specific need resched function + * + * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed. + * Defaults return true. + * + * The main purpose is to permit arch to avoid preemption of a task from an IRQ. + */ +static inline bool arch_irqentry_exit_need_resched(void); + +#ifndef arch_irqentry_exit_need_resched +static inline bool arch_irqentry_exit_need_resched(void) { return true; } +#endif + void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { @@ -150,7 +164,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (need_resched() && arch_irqentry_exit_need_resched()) preempt_schedule_irq(); } } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6c83ad674d01..808c0d7a31fa 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr } struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; + /* crosstask is not supported for user stacks */ + if (crosstask && user && !kernel) + return NULL; + entry = get_callchain_entry(&rctx); if (!entry) return NULL; - ctx.entry = entry; - ctx.max_stack = max_stack; - ctx.nr = entry->nr = init_nr; - ctx.contexts = 0; - ctx.contexts_maxed = false; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = 0; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) @@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, perf_callchain_kernel(&ctx, regs); } - if (user) { + if (user && !crosstask) { if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - if (crosstask) + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) goto exit_put; + regs = task_pt_regs(current); + } - if (add_mark) - perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - start_entry_idx = entry->nr; - perf_callchain_user(&ctx, regs); - fixup_uretprobe_trampoline_entries(entry, start_entry_idx); - } + start_entry_idx = entry->nr; + perf_callchain_user(&ctx, regs); + fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } exit_put: diff --git a/kernel/events/core.c b/kernel/events/core.c index 820127536e62..fb1eae762044 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3974,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, */ static inline bool event_update_userpage(struct perf_event *event) { - if (likely(!atomic_read(&event->mmap_count))) + if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); @@ -6710,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); if (mapped) mapped(event, vma->vm_mm); @@ -6749,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6769,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&rb->aux_mutex); } - if (atomic_dec_and_test(&rb->mmap_count)) + if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); @@ -6933,230 +6933,242 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) return err; } -static int perf_mmap(struct file *file, struct vm_area_struct *vma) +static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) { - struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; + unsigned long user_locked, user_lock_limit, locked, lock_limit; struct user_struct *user = current_user(); - struct mutex *aux_mutex = NULL; - struct perf_buffer *rb = NULL; - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra = 0, extra = 0; - int ret, flags = 0; - mapped_f mapped; + + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + /* Increase the limit linearly with more CPUs */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm); /* - * Don't allow mmap() of inherited per-task counters. This would - * create a performance issue due to all children writing to the - * same rb. + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += *user_extra; - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; + if (user_locked > user_lock_limit) { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ + *extra = user_locked - user_lock_limit; + *user_extra -= *extra; + } - ret = security_perf_event_read(event); - if (ret) - return ret; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; - vma_size = vma->vm_end - vma->vm_start; - nr_pages = vma_size / PAGE_SIZE; + return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); +} - if (nr_pages > INT_MAX) - return -ENOMEM; +static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) +{ + struct user_struct *user = current_user(); - if (vma_size != PAGE_SIZE * nr_pages) - return -EINVAL; + atomic_long_add(user_extra, &user->locked_vm); + atomic64_add(extra, &vma->vm_mm->pinned_vm); +} - user_extra = nr_pages; +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + struct perf_buffer *rb; + int rb_flags = 0; - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; + nr_pages -= 1; /* - * This relies on __pmu_detach_event() taking mmap_mutex after marking - * the event REVOKED. Either we observe the state, or __pmu_detach_event() - * will detach the rb created here. + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. */ - if (event->state <= PERF_EVENT_STATE_REVOKED) { - ret = -ENODEV; - goto unlock; - } - - if (vma->vm_pgoff == 0) { - nr_pages -= 1; - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - goto unlock; - - WARN_ON_ONCE(event->ctx->parent_ctx); + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) - goto unlock; + WARN_ON_ONCE(event->ctx->parent_ctx); - if (atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Success -- managed to mmap() the same buffer - * multiple times. - */ - ret = 0; - /* We need the rb to map pages. */ - rb = event->rb; - goto unlock; - } + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + return -EINVAL; + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the - * event and continue as if !event->rb + * Success -- managed to mmap() the same buffer + * multiple times. */ - ring_buffer_attach(event, NULL); + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + return 0; } - } else { /* - * AUX area mapping: if rb->aux_nr_pages != 0, it's already - * mapped, all subsequent mappings should have the same size - * and offset. Must be above the normal perf buffer. + * Raced against perf_mmap_close()'s + * refcount_dec_and_mutex_lock() remove the + * event and continue as if !event->rb */ - u64 aux_offset, aux_size; + ring_buffer_attach(event, NULL); + } - rb = event->rb; - if (!rb) - goto aux_unlock; + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) + return -EPERM; - aux_mutex = &rb->aux_mutex; - mutex_lock(aux_mutex); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - aux_offset = READ_ONCE(rb->user_page->aux_offset); - aux_size = READ_ONCE(rb->user_page->aux_size); + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, rb_flags); - if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto aux_unlock; + if (!rb) + return -ENOMEM; - if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto aux_unlock; + refcount_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - /* already mapped with a different offset */ - if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto aux_unlock; + ring_buffer_attach(event, rb); - if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) - goto aux_unlock; + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); - /* already mapped with a different size */ - if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto aux_unlock; + perf_mmap_account(vma, user_extra, extra); + refcount_set(&event->mmap_count, 1); - if (!is_power_of_2(nr_pages)) - goto aux_unlock; + return 0; +} - if (!atomic_inc_not_zero(&rb->mmap_count)) - goto aux_unlock; +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + u64 aux_offset, aux_size; + struct perf_buffer *rb; + int ret, rb_flags = 0; - if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); - ret = 0; - goto unlock; - } - } + rb = event->rb; + if (!rb) + return -EINVAL; - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + guard(mutex)(&rb->aux_mutex); /* - * Increase the limit linearly with more CPUs: + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. */ - user_lock_limit *= num_online_cpus(); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); - user_locked = atomic_long_read(&user->locked_vm); + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + return -EINVAL; - /* - * sysctl_perf_event_mlock may have changed, so that - * user->locked_vm > user_lock_limit - */ - if (user_locked > user_lock_limit) - user_locked = user_lock_limit; - user_locked += user_extra; + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + return -EINVAL; - if (user_locked > user_lock_limit) { - /* - * charge locked_vm until it hits user_lock_limit; - * charge the rest from pinned_vm - */ - extra = user_locked - user_lock_limit; - user_extra -= extra; - } + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + return -EINVAL; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; + if (aux_size != nr_pages * PAGE_SIZE) + return -EINVAL; - if ((locked > lock_limit) && perf_is_paranoid() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + return -EINVAL; - WARN_ON(!rb && event->rb); + if (!is_power_of_2(nr_pages)) + return -EINVAL; - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (!refcount_inc_not_zero(&rb->mmap_count)) + return -EINVAL; - if (!rb) { - rb = rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); + if (rb_has_aux(rb)) { + refcount_inc(&rb->aux_mmap_count); - if (!rb) { - ret = -ENOMEM; - goto unlock; + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + refcount_dec(&rb->mmap_count); + return -EPERM; } - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; + WARN_ON(!rb && event->rb); - ring_buffer_attach(event, rb); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - perf_event_update_time(event); - perf_event_init_userpage(event); - perf_event_update_userpage(event); - ret = 0; - } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (!ret) { - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; + event->attr.aux_watermark, rb_flags); + if (ret) { + refcount_dec(&rb->mmap_count); + return ret; } + + refcount_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; } -unlock: - if (!ret) { - atomic_long_add(user_extra, &user->locked_vm); - atomic64_add(extra, &vma->vm_mm->pinned_vm); - - atomic_inc(&event->mmap_count); - } else if (rb) { - /* AUX allocation failed */ - atomic_dec(&rb->mmap_count); - } -aux_unlock: - if (aux_mutex) - mutex_unlock(aux_mutex); - mutex_unlock(&event->mmap_mutex); + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + return 0; +} + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long vma_size, nr_pages; + mapped_f mapped; + int ret; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same rb. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + ret = security_perf_event_read(event); if (ret) return ret; + vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + scoped_guard (mutex, &event->mmap_mutex) { + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else + ret = perf_mmap_aux(vma, event, nr_pages); + if (ret) + return ret; + } + /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. @@ -7174,7 +7186,7 @@ aux_unlock: * full cleanup in this case and therefore does not invoke * vmops::close(). */ - ret = map_range(rb, vma); + ret = map_range(event->rb, vma); if (ret) perf_mmap_close(vma); @@ -7440,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (!(current->flags & PF_KTHREAD)) { + } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -8080,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ - if (current->mm != NULL) { + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { struct page *p; pagefault_disable(); @@ -8192,7 +8204,8 @@ struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; + bool user = !event->attr.exclude_callchain_user && + !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; @@ -8204,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, 0, kernel, user, + callchain = get_perf_callchain(regs, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } @@ -13249,7 +13262,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { @@ -13262,7 +13275,7 @@ set: goto unlock; /* did we race against perf_mmap_close() */ - if (!atomic_read(&rb->mmap_count)) { + if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 249288d82b8d..d9cc57083091 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,7 +35,7 @@ struct perf_buffer { spinlock_t event_lock; struct list_head event_list; - atomic_t mmap_count; + refcount_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; @@ -47,7 +47,7 @@ struct perf_buffer { unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; - atomic_t aux_mmap_count; + refcount_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index aa9a759e824f..20a905023736 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). */ - if (!atomic_read(&rb->aux_mmap_count)) + if (!refcount_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..c8bfff023c6e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -121,7 +121,7 @@ struct xol_area { static void uprobe_warn(struct task_struct *t, const char *msg) { - pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); + pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg); } /* @@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { @@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, return identical; } -static int __uprobe_write_opcode(struct vm_area_struct *vma, +static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long opcode_vaddr, uprobe_opcode_t opcode) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(&opcode); + const unsigned long vaddr = insn_vaddr & PAGE_MASK; bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); - copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and @@ -482,23 +483,32 @@ remap: * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * - * Called with mm->mmap_lock held for read or write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode) + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - int ret, is_register, ref_ctr_updated = 0; + int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; - is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really - * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. */ if (is_register) @@ -521,14 +531,14 @@ retry: goto out; folio = page_folio(page); - ret = verify_opcode(page, opcode_vaddr, &opcode); + ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; } /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); @@ -560,7 +570,7 @@ retry: /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } @@ -580,7 +590,7 @@ retry: out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && ref_ctr_updated) + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ @@ -602,7 +612,7 @@ out: int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** @@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ @@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; } @@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } @@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) @@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. */ @@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; @@ -2160,7 +2180,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. */ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; @@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ @@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2741,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; @@ -2752,6 +2779,23 @@ out: rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. diff --git a/kernel/fork.c b/kernel/fork.c index c4ada32598bd..cffa6157a55a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1014,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } @@ -1507,7 +1508,7 @@ fail_nomem: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1545,7 +1546,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { @@ -1566,7 +1567,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1596,7 +1597,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, return 0; } -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) +static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -1645,7 +1646,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; @@ -1688,6 +1689,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->cgroup_threadgroup_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -2295,7 +2300,7 @@ __latent_entropy struct task_struct *copy_process( if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) - goto bad_fork_core_free; + goto bad_fork_cancel_cgroup; /* * If we fail beyond this point we don't free the allocated * futex hash map. We assume that another thread will be created diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index c716a66f8692..d818b4d47f1b 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -230,8 +230,9 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - q->key = *key; + struct task_struct *task; + q->key = *key; __futex_unqueue(q); WARN_ON(!q->rt_waiter); @@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, futex_hash_get(hb); q->drop_hb_ref = true; q->lock_ptr = &hb->lock; + task = READ_ONCE(q->task); /* Signal locked state to the waiter */ futex_requeue_pi_complete(q, 1); - wake_up_state(q->task, TASK_NORMAL); + wake_up_state(task, TASK_NORMAL); } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9b09ad3f9914..e7ad99254841 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1644,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) else __msi_domain_free_irqs(dev, domain, ctrl); - if (ops->msi_post_free) - ops->msi_post_free(domain, dev); - if (info->flags & MSI_FLAG_FREE_MSI_DESCS) msi_domain_free_descs(dev, ctrl); } diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 39278737bb68..2a1beebf1d37 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -460,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST config MODULES_TREE_LOOKUP def_bool y - depends on PERF_EVENTS || TRACING || CFI_CLANG + depends on PERF_EVENTS || TRACING || CFI endif # MODULES diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index d3204c5c74eb..f8e8c126705c 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -14,7 +14,7 @@ * Use a latched RB-tree for __module_address(); this allows us to use * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * This is conditional on PERF_EVENTS || TRACING || CFI because those can * really hit __module_address() hard by doing a lot of stack unwinding; * potentially from NMI context. */ diff --git a/kernel/nscommon.c b/kernel/nscommon.c new file mode 100644 index 000000000000..c1fb2bad6d72 --- /dev/null +++ b/kernel/nscommon.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ns_common.h> +#include <linux/proc_ns.h> +#include <linux/vfsdebug.h> + +#ifdef CONFIG_DEBUG_VFS +static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) +{ + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + VFS_WARN_ON_ONCE(ops != &cgroupns_operations); + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + VFS_WARN_ON_ONCE(ops != &ipcns_operations); + break; +#endif + case CLONE_NEWNS: + VFS_WARN_ON_ONCE(ops != &mntns_operations); + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + VFS_WARN_ON_ONCE(ops != &netns_operations); + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + VFS_WARN_ON_ONCE(ops != &pidns_operations); + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + VFS_WARN_ON_ONCE(ops != &timens_operations); + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + VFS_WARN_ON_ONCE(ops != &userns_operations); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + VFS_WARN_ON_ONCE(ops != &utsns_operations); + break; +#endif + } +} +#endif + +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) +{ + refcount_set(&ns->__ns_ref, 1); + ns->stashed = NULL; + ns->ops = ops; + ns->ns_id = 0; + ns->ns_type = ns_type; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); + +#ifdef CONFIG_DEBUG_VFS + ns_debug(ns, ops); +#endif + + if (inum) { + ns->inum = inum; + return 0; + } + return proc_alloc_inum(&ns->inum); +} + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..19aa64ab08c8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -144,7 +144,7 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); @@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c new file mode 100644 index 000000000000..b24a320a11a6 --- /dev/null +++ b/kernel/nstree.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/nstree.h> +#include <linux/proc_ns.h> +#include <linux/vfsdebug.h> + +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + * @type: type of namespaces in this tree + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + +struct ns_tree mnt_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), + .type = CLONE_NEWNS, +}; + +struct ns_tree net_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), + .type = CLONE_NEWNET, +}; +EXPORT_SYMBOL_GPL(net_ns_tree); + +struct ns_tree uts_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), + .type = CLONE_NEWUTS, +}; + +struct ns_tree user_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), + .type = CLONE_NEWUSER, +}; + +struct ns_tree ipc_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), + .type = CLONE_NEWIPC, +}; + +struct ns_tree pid_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), + .type = CLONE_NEWPID, +}; + +struct ns_tree cgroup_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), + .type = CLONE_NEWCGROUP, +}; + +struct ns_tree time_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), + .type = CLONE_NEWTIME, +}; + +DEFINE_COOKIE(namespace_cookie); + +static inline struct ns_common *node_to_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_tree_node); +} + +static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns(a); + struct ns_common *ns_b = node_to_ns(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if (ns_id_a > ns_id_b) + return 1; + return 0; +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +{ + struct rb_node *node, *prev; + + VFS_WARN_ON_ONCE(!ns->ns_id); + + write_seqlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + + node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. + */ + prev = rb_prev(&ns->ns_tree_node); + if (!prev) + list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); + else + list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + + write_sequnlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(node); +} + +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +{ + VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + + write_seqlock(&ns_tree->ns_tree_lock); + rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); + list_bidir_del_rcu(&ns->ns_list_node); + RB_CLEAR_NODE(&ns->ns_tree_node); + write_sequnlock(&ns_tree->ns_tree_lock); +} +EXPORT_SYMBOL_GPL(__ns_tree_remove); + +static int ns_find(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + + +static struct ns_tree *ns_tree_from_type(int ns_type) +{ + switch (ns_type) { + case CLONE_NEWCGROUP: + return &cgroup_ns_tree; + case CLONE_NEWIPC: + return &ipc_ns_tree; + case CLONE_NEWNS: + return &mnt_ns_tree; + case CLONE_NEWNET: + return &net_ns_tree; + case CLONE_NEWPID: + return &pid_ns_tree; + case CLONE_NEWUSER: + return &user_ns_tree; + case CLONE_NEWUTS: + return &uts_ns_tree; + case CLONE_NEWTIME: + return &time_ns_tree; + } + + return NULL; +} + +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree *ns_tree; + struct rb_node *node; + unsigned int seq; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + + do { + seq = read_seqbegin(&ns_tree->ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + if (node) + break; + } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + + if (!node) + return NULL; + + VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); + + return node_to_ns(node); +} + +/** + * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * tree + * @ns: namespace to start from + * @previous: if true find the previous namespace, otherwise the next + * + * Find the next or previous namespace in the same tree as @ns. If + * there is no next/previous namespace, -ENOENT is returned. + */ +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, bool previous) +{ + struct list_head *list; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); + + if (previous) + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + else + list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); + if (list_is_head(list, &ns_tree->ns_list)) + return ERR_PTR(-ENOENT); + + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); + + return list_entry_rcu(list, struct ns_common, ns_list_node); +} + +/** + * ns_tree_gen_id - generate a new namespace id + * @ns: namespace to generate id for + * + * Generates a new namespace id and assigns it to the namespace. All + * namespaces types share the same id space and thus can be compared + * directly. IOW, when two ids of two namespace are equal, they are + * identical. + */ +u64 ns_tree_gen_id(struct ns_common *ns) +{ + guard(preempt)(); + ns->ns_id = gen_cookie_next(&namespace_cookie); + return ns->ns_id; +} diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd2..4fffec767a63 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,13 +71,13 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = PROC_PID_INIT_INO, + .ns.inum = ns_init_inum(&init_pid_ns), #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif @@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = { #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif + .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -491,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) struct upid *upid; pid_t nr = 0; - if (pid && ns->level <= pid->level) { + if (pid && ns && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; @@ -514,7 +515,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (ns) + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; @@ -680,7 +682,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head, container_of(head->set, struct pid_namespace, set); int mode = table->mode; - if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) || + if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) || uid_eq(current_euid(), make_kuid(pidns->user_ns, 0))) mode = (mode & S_IRWXU) >> 6; else if (in_egroup_p(make_kgid(pidns->user_ns, 0))) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..650be58d8d18 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> +#include <linux/nstree.h> #include <uapi/linux/wait.h> #include "pid_sysctl.h" @@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto out_free_idr; - ns->ns.ops = &pidns_operations; ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; - refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); @@ -124,10 +123,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif + ns_tree_add(ns); return ns; out_free_inum: - ns_free_inum(&ns->ns); + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -149,9 +149,10 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { + ns_tree_remove(ns); unregister_pidns_sysctls(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); @@ -168,10 +169,10 @@ static void destroy_pid_namespace_work(struct work_struct *work) parent = ns->parent; destroy_pid_namespace(ns); ns = parent; - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); + } while (ns != &init_pid_ns && ns_ref_put(ns)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) @@ -183,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns != &init_pid_ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); @@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; @@ -390,11 +386,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -408,13 +416,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. */ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); @@ -447,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -458,7 +459,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, @@ -475,6 +475,7 @@ static __init int pid_namespaces_init(void) #endif register_pid_ns_sysctl_table_vm(); + ns_tree_add(&init_pid_ns); return 0; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index ea7995a25780..8df55397414a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -553,6 +553,30 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts) { + int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts); + + if (_is_cpu_device(dev)) + em_check_capacity_update(); + + return ret; +} +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_register_pd_no_update() - Register a perf domain for a device + * @dev : Device to register the PD for + * @nr_states : Number of performance states in the new PD + * @cb : Callback functions for populating the energy model + * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device) + * @microwatts : Whether or not the power values in the EM will be in uW + * + * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity + * update after registering the PD, even if @dev is a CPU device. + */ +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; @@ -636,12 +660,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); - if (_is_cpu_device(dev)) - em_check_capacity_update(); - return ret; } -EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); /** * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f1f30cca573..2f66ab453823 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -449,6 +449,7 @@ int hibernation_snapshot(int platform_mode) shrink_shmem_memory(); console_suspend_all(); + pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index c4a488e67aa7..755883faf751 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -58,6 +58,7 @@ #include "deadline.c" #ifdef CONFIG_SCHED_CLASS_EXT +# include "ext_internal.h" # include "ext.c" # include "ext_idle.c" #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..ec126f85ff40 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,8 @@ * Copyright (C) 1991-2002 Linus Torvalds * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ +#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE +#include <linux/sched.h> #include <linux/highmem.h> #include <linux/hrtimer_api.h> #include <linux/ktime_api.h> @@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) __do_set_cpus_allowed(p, &ac); } -void migrate_disable(void) -{ - struct task_struct *p = current; - - if (p->migration_disabled) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - *Warn about overflow half-way through the range. - */ - WARN_ON_ONCE((s16)p->migration_disabled < 0); -#endif - p->migration_disabled++; - return; - } - - guard(preempt)(); - this_rq()->nr_pinned++; - p->migration_disabled = 1; -} -EXPORT_SYMBOL_GPL(migrate_disable); - -void migrate_enable(void) +void ___migrate_enable(void) { struct task_struct *p = current; struct affinity_context ac = { @@ -2410,35 +2391,19 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Check both overflow from migrate_disable() and superfluous - * migrate_enable(). - */ - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) - return; -#endif + __set_cpus_allowed_ptr(p, &ac); +} +EXPORT_SYMBOL_GPL(___migrate_enable); - if (p->migration_disabled > 1) { - p->migration_disabled--; - return; - } +void migrate_disable(void) +{ + __migrate_disable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); - /* - * Ensure stop_task runs either before or after this, and that - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). - */ - guard(preempt)(); - if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &ac); - /* - * Mustn't clear migration_disabled() until cpus_ptr points back at the - * regular cpus_mask, otherwise things that race (eg. - * select_fallback_rq) get confused. - */ - barrier(); - p->migration_disabled = 0; - this_rq()->nr_pinned--; +void migrate_enable(void) +{ + __migrate_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -4472,7 +4437,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. */ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4490,6 +4455,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS @@ -4707,7 +4675,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* @@ -9362,8 +9330,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task, false); - - scx_cgroup_finish_attach(); } static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) @@ -9551,7 +9517,7 @@ static unsigned long tg_weight(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); #else - return sched_weight_from_cgroup(tg->scx_weight); + return sched_weight_from_cgroup(tg->scx.weight); #endif } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f25301267e47..615411a0a881 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_se->dl_defer && !dl_se->dl_defer_running && dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { - if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + if (!is_dl_boosted(dl_se)) { /* * Set dl_se->dl_defer_armed and dl_throttled variables to @@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; -static bool dl_server_stopped(struct sched_dl_entity *dl_se); - static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->dl_runtime) return HRTIMER_NORESTART; - if (!dl_se->server_has_tasks(dl_se)) { - replenish_dl_entity(dl_se); - dl_server_stopped(dl_se); - return HRTIMER_NORESTART; - } - if (dl_se->dl_defer_armed) { /* * First check if the server could consume runtime in background. @@ -1579,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) { - dl_se->dl_server_idle = 0; + if (dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); - } } void dl_server_start(struct sched_dl_entity *dl_se) @@ -1610,26 +1600,10 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } -static bool dl_server_stopped(struct sched_dl_entity *dl_se) -{ - if (!dl_se->dl_server_active) - return true; - - if (dl_se->dl_server_idle) { - dl_server_stop(dl_se); - return true; - } - - dl_se->dl_server_idle = 1; - return false; -} - void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) { dl_se->rq = rq; - dl_se->server_has_tasks = has_tasks; dl_se->server_pick_task = pick_task; } @@ -2394,10 +2368,7 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (!dl_server_stopped(dl_se)) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); - } + dl_server_stop(dl_se); goto again; } rq->dl_server = dl_se; @@ -2580,6 +2551,25 @@ static int find_later_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); + + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); + + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); + + return p; +} + /* Locks the rq it finds */ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) { @@ -2607,12 +2597,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { - if (unlikely(task_rq(task) != rq || + /* + * double_lock_balance had to release rq->lock, in the + * meantime, task may no longer be fit to be migrated. + * Check the following to ensure that the task is + * still suitable for migration: + * 1. It is possible the task was scheduled, + * migrate_disabled was set and then got preempted, + * so we must check the task migration disable + * flag. + * 2. The CPU picked is in the task's affinity. + * 3. For throttled task (dl_task_offline_migration), + * check the following: + * - the task is not on the rq anymore (it was + * migrated) + * - the task is not on CPU anymore + * - the task is still a dl task + * - the task is not queued on the rq anymore + * 4. For the non-throttled task (push_dl_task), the + * check to ensure that this task is still at the + * head of the pushable tasks list is enough. + */ + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !dl_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + (task->dl.dl_throttled && + (task_rq(task) != rq || + task_on_cpu(rq, task) || + !dl_task(task) || + !task_on_rq_queued(task))) || + (!task->dl.dl_throttled && + task != pick_next_pushable_dl_task(rq)))) { + double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -2635,25 +2650,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) return later_rq; } -static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_dl_tasks(rq)) - return NULL; - - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - - WARN_ON_ONCE(rq->cpu != task_cpu(p)); - WARN_ON_ONCE(task_current(rq, p)); - WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - - WARN_ON_ONCE(!task_on_rq_queued(p)); - WARN_ON_ONCE(!dl_task(p)); - - return p; -} - /* * See if the non running -deadline tasks on this rq * can be sent to some other CPU where they can preempt diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae32ef179dd..2b0e88206d07 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,1040 +9,6 @@ #include <linux/btf_ids.h> #include "ext_idle.h" -#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) - -enum scx_consts { - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, - - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, - SCX_EXIT_DUMP_DFL_LEN = 32768, - - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, - - /* - * Iterating all tasks may take a while. Periodically drop - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. - */ - SCX_TASK_ITER_BATCH = 32, -}; - -enum scx_exit_kind { - SCX_EXIT_NONE, - SCX_EXIT_DONE, - - SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ - SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ - SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ - SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ - - SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ - SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ - SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -}; - -/* - * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), - * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes - * are 64bit of the format: - * - * Bits: [63 .. 48 47 .. 32 31 .. 0] - * [ SYS ACT ] [ SYS RSN ] [ USR ] - * - * SYS ACT: System-defined exit actions - * SYS RSN: System-defined exit reasons - * USR : User-defined exit codes and reasons - * - * Using the above, users may communicate intention and context by ORing system - * actions and/or system reasons with a user-defined exit code. - */ -enum scx_exit_code { - /* Reasons */ - SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, - - /* Actions */ - SCX_ECODE_ACT_RESTART = 1LLU << 48, -}; - -/* - * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is - * being disabled. - */ -struct scx_exit_info { - /* %SCX_EXIT_* - broad category of the exit reason */ - enum scx_exit_kind kind; - - /* exit code if gracefully exiting */ - s64 exit_code; - - /* textual representation of the above */ - const char *reason; - - /* backtrace if exiting due to an error */ - unsigned long *bt; - u32 bt_len; - - /* informational message */ - char *msg; - - /* debug dump */ - char *dump; -}; - -/* sched_ext_ops.flags */ -enum scx_ops_flags { - /* - * Keep built-in idle tracking even if ops.update_idle() is implemented. - */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, - - /* - * By default, if there are no other task to run on the CPU, ext core - * keeps running the current task even after its slice expires. If this - * flag is specified, such tasks are passed to ops.enqueue() with - * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. - */ - SCX_OPS_ENQ_LAST = 1LLU << 1, - - /* - * An exiting task may schedule after PF_EXITING is set. In such cases, - * bpf_task_from_pid() may not be able to find the task and if the BPF - * scheduler depends on pid lookup for dispatching, the task will be - * lost leading to various issues including RCU grace period stalls. - * - * To mask this problem, by default, unhashed tasks are automatically - * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't - * depend on pid lookups and wants to handle these tasks directly, the - * following flag can be used. - */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, - - /* - * If set, only tasks with policy set to SCHED_EXT are attached to - * sched_ext. If clear, SCHED_NORMAL tasks are also included. - */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, - - /* - * A migration disabled task can only execute on its current CPU. By - * default, such tasks are automatically put on the CPU's local DSQ with - * the default slice on enqueue. If this ops flag is set, they also go - * through ops.enqueue(). - * - * A migration disabled task never invokes ops.select_cpu() as it can - * only select the current CPU. Also, p->cpus_ptr will only contain its - * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr - * and thus may disagree with cpumask_weight(p->cpus_ptr). - */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, - - /* - * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes - * ops.enqueue() on the ops.select_cpu() selected or the wakee's - * previous CPU via IPI (inter-processor interrupt) to reduce cacheline - * transfers. When this optimization is enabled, ops.select_cpu() is - * skipped in some cases (when racing against the wakee switching out). - * As the BPF scheduler may depend on ops.select_cpu() being invoked - * during wakeups, queued wakeup is disabled by default. - * - * If this ops flag is set, queued wakeup optimization is enabled and - * the BPF scheduler must be able to handle ops.enqueue() invoked on the - * wakee's CPU without preceding ops.select_cpu() even for tasks which - * may be executed on multiple CPUs. - */ - SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, - - /* - * If set, enable per-node idle cpumasks. If clear, use a single global - * flat idle cpumask. - */ - SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, - - /* - * CPU cgroup support flags - */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ - - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | - SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING | - SCX_OPS_ENQ_MIGRATION_DISABLED | - SCX_OPS_ALLOW_QUEUED_WAKEUP | - SCX_OPS_SWITCH_PARTIAL | - SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, - - /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ - __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, - - SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, -}; - -/* argument container for ops.init_task() */ -struct scx_init_task_args { - /* - * Set if ops.init_task() is being invoked on the fork path, as opposed - * to the scheduler transition path. - */ - bool fork; -#ifdef CONFIG_EXT_GROUP_SCHED - /* the cgroup the task is joining */ - struct cgroup *cgroup; -#endif -}; - -/* argument container for ops.exit_task() */ -struct scx_exit_task_args { - /* Whether the task exited before running on sched_ext. */ - bool cancelled; -}; - -/* argument container for ops->cgroup_init() */ -struct scx_cgroup_init_args { - /* the weight of the cgroup [1..10000] */ - u32 weight; - - /* bandwidth control parameters from cpu.max and cpu.max.burst */ - u64 bw_period_us; - u64 bw_quota_us; - u64 bw_burst_us; -}; - -enum scx_cpu_preempt_reason { - /* next task is being scheduled by &sched_class_rt */ - SCX_CPU_PREEMPT_RT, - /* next task is being scheduled by &sched_class_dl */ - SCX_CPU_PREEMPT_DL, - /* next task is being scheduled by &sched_class_stop */ - SCX_CPU_PREEMPT_STOP, - /* unknown reason for SCX being preempted */ - SCX_CPU_PREEMPT_UNKNOWN, -}; - -/* - * Argument container for ops->cpu_acquire(). Currently empty, but may be - * expanded in the future. - */ -struct scx_cpu_acquire_args {}; - -/* argument container for ops->cpu_release() */ -struct scx_cpu_release_args { - /* the reason the CPU was preempted */ - enum scx_cpu_preempt_reason reason; - - /* the task that's going to be scheduled on the CPU */ - struct task_struct *task; -}; - -/* - * Informational context provided to dump operations. - */ -struct scx_dump_ctx { - enum scx_exit_kind kind; - s64 exit_code; - const char *reason; - u64 at_ns; - u64 at_jiffies; -}; - -/** - * struct sched_ext_ops - Operation table for BPF scheduler implementation - * - * A BPF scheduler can implement an arbitrary scheduling policy by - * implementing and loading operations in this table. Note that a userland - * scheduling policy can also be implemented using the BPF scheduler - * as a shim layer. - */ -struct sched_ext_ops { - /** - * @select_cpu: Pick the target CPU for a task which is being woken up - * @p: task being woken up - * @prev_cpu: the cpu @p was on before sleeping - * @wake_flags: SCX_WAKE_* - * - * Decision made here isn't final. @p may be moved to any CPU while it - * is getting dispatched for execution later. However, as @p is not on - * the rq at this point, getting the eventual execution CPU right here - * saves a small bit of overhead down the line. - * - * If an idle CPU is returned, the CPU is kicked and will try to - * dispatch. While an explicit custom mechanism can be added, - * select_cpu() serves as the default way to wake up idle CPUs. - * - * @p may be inserted into a DSQ directly by calling - * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. - * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ - * of the CPU returned by this operation. - * - * Note that select_cpu() is never called for tasks that can only run - * on a single CPU or tasks with migration disabled, as they don't have - * the option to select a different CPU. See select_task_rq() for - * details. - */ - s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); - - /** - * @enqueue: Enqueue a task on the BPF scheduler - * @p: task being enqueued - * @enq_flags: %SCX_ENQ_* - * - * @p is ready to run. Insert directly into a DSQ by calling - * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly - * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, - * the task will stall. - * - * If @p was inserted into a DSQ from ops.select_cpu(), this callback is - * skipped. - */ - void (*enqueue)(struct task_struct *p, u64 enq_flags); - - /** - * @dequeue: Remove a task from the BPF scheduler - * @p: task being dequeued - * @deq_flags: %SCX_DEQ_* - * - * Remove @p from the BPF scheduler. This is usually called to isolate - * the task while updating its scheduling properties (e.g. priority). - * - * The ext core keeps track of whether the BPF side owns a given task or - * not and can gracefully ignore spurious dispatches from BPF side, - * which makes it safe to not implement this method. However, depending - * on the scheduling logic, this can lead to confusing behaviors - e.g. - * scheduling position not being updated across a priority change. - */ - void (*dequeue)(struct task_struct *p, u64 deq_flags); - - /** - * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs - * @cpu: CPU to dispatch tasks for - * @prev: previous task being switched out - * - * Called when a CPU's local dsq is empty. The operation should dispatch - * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ - * using scx_bpf_dsq_move_to_local(). - * - * The maximum number of times scx_bpf_dsq_insert() can be called - * without an intervening scx_bpf_dsq_move_to_local() is specified by - * ops.dispatch_max_batch. See the comments on top of the two functions - * for more details. - * - * When not %NULL, @prev is an SCX task with its slice depleted. If - * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in - * @prev->scx.flags, it is not enqueued yet and will be enqueued after - * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. - */ - void (*dispatch)(s32 cpu, struct task_struct *prev); - - /** - * @tick: Periodic tick - * @p: task running currently - * - * This operation is called every 1/HZ seconds on CPUs which are - * executing an SCX task. Setting @p->scx.slice to 0 will trigger an - * immediate dispatch cycle on the CPU. - */ - void (*tick)(struct task_struct *p); - - /** - * @runnable: A task is becoming runnable on its associated CPU - * @p: task becoming runnable - * @enq_flags: %SCX_ENQ_* - * - * This and the following three functions can be used to track a task's - * execution state transitions. A task becomes ->runnable() on a CPU, - * and then goes through one or more ->running() and ->stopping() pairs - * as it runs on the CPU, and eventually becomes ->quiescent() when it's - * done running on the CPU. - * - * @p is becoming runnable on the CPU because it's - * - * - waking up (%SCX_ENQ_WAKEUP) - * - being moved from another CPU - * - being restored after temporarily taken off the queue for an - * attribute change. - * - * This and ->enqueue() are related but not coupled. This operation - * notifies @p's state transition and may not be followed by ->enqueue() - * e.g. when @p is being dispatched to a remote CPU, or when @p is - * being enqueued on a CPU experiencing a hotplug event. Likewise, a - * task may be ->enqueue()'d without being preceded by this operation - * e.g. after exhausting its slice. - */ - void (*runnable)(struct task_struct *p, u64 enq_flags); - - /** - * @running: A task is starting to run on its associated CPU - * @p: task starting to run - * - * Note that this callback may be called from a CPU other than the - * one the task is going to run on. This can happen when a task - * property is changed (i.e., affinity), since scx_next_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to determine the - * target CPU the task is going to use. - * - * See ->runnable() for explanation on the task state notifiers. - */ - void (*running)(struct task_struct *p); - - /** - * @stopping: A task is stopping execution - * @p: task stopping to run - * @runnable: is task @p still runnable? - * - * Note that this callback may be called from a CPU other than the - * one the task was running on. This can happen when a task - * property is changed (i.e., affinity), since dequeue_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU - * the task was running on. - * - * See ->runnable() for explanation on the task state notifiers. If - * !@runnable, ->quiescent() will be invoked after this operation - * returns. - */ - void (*stopping)(struct task_struct *p, bool runnable); - - /** - * @quiescent: A task is becoming not runnable on its associated CPU - * @p: task becoming not runnable - * @deq_flags: %SCX_DEQ_* - * - * See ->runnable() for explanation on the task state notifiers. - * - * @p is becoming quiescent on the CPU because it's - * - * - sleeping (%SCX_DEQ_SLEEP) - * - being moved to another CPU - * - being temporarily taken off the queue for an attribute change - * (%SCX_DEQ_SAVE) - * - * This and ->dequeue() are related but not coupled. This operation - * notifies @p's state transition and may not be preceded by ->dequeue() - * e.g. when @p is being dispatched to a remote CPU. - */ - void (*quiescent)(struct task_struct *p, u64 deq_flags); - - /** - * @yield: Yield CPU - * @from: yielding task - * @to: optional yield target task - * - * If @to is NULL, @from is yielding the CPU to other runnable tasks. - * The BPF scheduler should ensure that other available tasks are - * dispatched before the yielding task. Return value is ignored in this - * case. - * - * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf - * scheduler can implement the request, return %true; otherwise, %false. - */ - bool (*yield)(struct task_struct *from, struct task_struct *to); - - /** - * @core_sched_before: Task ordering for core-sched - * @a: task A - * @b: task B - * - * Used by core-sched to determine the ordering between two tasks. See - * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on - * core-sched. - * - * Both @a and @b are runnable and may or may not currently be queued on - * the BPF scheduler. Should return %true if @a should run before @b. - * %false if there's no required ordering or @b should run before @a. - * - * If not specified, the default is ordering them according to when they - * became runnable. - */ - bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); - - /** - * @set_weight: Set task weight - * @p: task to set weight for - * @weight: new weight [1..10000] - * - * Update @p's weight to @weight. - */ - void (*set_weight)(struct task_struct *p, u32 weight); - - /** - * @set_cpumask: Set CPU affinity - * @p: task to set CPU affinity for - * @cpumask: cpumask of cpus that @p can run on - * - * Update @p's CPU affinity to @cpumask. - */ - void (*set_cpumask)(struct task_struct *p, - const struct cpumask *cpumask); - - /** - * @update_idle: Update the idle state of a CPU - * @cpu: CPU to update the idle state for - * @idle: whether entering or exiting the idle state - * - * This operation is called when @rq's CPU goes or leaves the idle - * state. By default, implementing this operation disables the built-in - * idle CPU tracking and the following helpers become unavailable: - * - * - scx_bpf_select_cpu_dfl() - * - scx_bpf_select_cpu_and() - * - scx_bpf_test_and_clear_cpu_idle() - * - scx_bpf_pick_idle_cpu() - * - * The user also must implement ops.select_cpu() as the default - * implementation relies on scx_bpf_select_cpu_dfl(). - * - * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle - * tracking. - */ - void (*update_idle)(s32 cpu, bool idle); - - /** - * @cpu_acquire: A CPU is becoming available to the BPF scheduler - * @cpu: The CPU being acquired by the BPF scheduler. - * @args: Acquire arguments, see the struct definition. - * - * A CPU that was previously released from the BPF scheduler is now once - * again under its control. - */ - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); - - /** - * @cpu_release: A CPU is taken away from the BPF scheduler - * @cpu: The CPU being released by the BPF scheduler. - * @args: Release arguments, see the struct definition. - * - * The specified CPU is no longer under the control of the BPF - * scheduler. This could be because it was preempted by a higher - * priority sched_class, though there may be other reasons as well. The - * caller should consult @args->reason to determine the cause. - */ - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - - /** - * @init_task: Initialize a task to run in a BPF scheduler - * @p: task to initialize for BPF scheduling - * @args: init arguments, see the struct definition - * - * Either we're loading a BPF scheduler or a new task is being forked. - * Initialize @p for BPF scheduling. This operation may block and can - * be used for allocations, and is called exactly once for a task. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During a fork, it - * will abort that specific fork. - */ - s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); - - /** - * @exit_task: Exit a previously-running task from the system - * @p: task to exit - * @args: exit arguments, see the struct definition - * - * @p is exiting or the BPF scheduler is being unloaded. Perform any - * necessary cleanup for @p. - */ - void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); - - /** - * @enable: Enable BPF scheduling for a task - * @p: task to enable BPF scheduling for - * - * Enable @p for BPF scheduling. enable() is called on @p any time it - * enters SCX, and is always paired with a matching disable(). - */ - void (*enable)(struct task_struct *p); - - /** - * @disable: Disable BPF scheduling for a task - * @p: task to disable BPF scheduling for - * - * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. - * Disable BPF scheduling for @p. A disable() call is always matched - * with a prior enable() call. - */ - void (*disable)(struct task_struct *p); - - /** - * @dump: Dump BPF scheduler state on error - * @ctx: debug dump context - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. - */ - void (*dump)(struct scx_dump_ctx *ctx); - - /** - * @dump_cpu: Dump BPF scheduler state for a CPU on error - * @ctx: debug dump context - * @cpu: CPU to generate debug dump for - * @idle: @cpu is currently idle without any runnable tasks - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @cpu. If @idle is %true and this operation doesn't produce any - * output, @cpu is skipped for dump. - */ - void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); - - /** - * @dump_task: Dump BPF scheduler state for a runnable task on error - * @ctx: debug dump context - * @p: runnable task to generate debug dump for - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @p. - */ - void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); - -#ifdef CONFIG_EXT_GROUP_SCHED - /** - * @cgroup_init: Initialize a cgroup - * @cgrp: cgroup being initialized - * @args: init arguments, see the struct definition - * - * Either the BPF scheduler is being loaded or @cgrp created, initialize - * @cgrp for sched_ext. This operation may block. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During cgroup - * creation, it will abort the specific cgroup creation. - */ - s32 (*cgroup_init)(struct cgroup *cgrp, - struct scx_cgroup_init_args *args); - - /** - * @cgroup_exit: Exit a cgroup - * @cgrp: cgroup being exited - * - * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit - * @cgrp for sched_ext. This operation my block. - */ - void (*cgroup_exit)(struct cgroup *cgrp); - - /** - * @cgroup_prep_move: Prepare a task to be moved to a different cgroup - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Prepare @p for move from cgroup @from to @to. This operation may - * block and can be used for allocations. - * - * Return 0 for success, -errno for failure. An error return aborts the - * migration. - */ - s32 (*cgroup_prep_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_move: Commit cgroup move - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Commit the move. @p is dequeued during this operation. - */ - void (*cgroup_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_cancel_move: Cancel cgroup move - * @p: task whose cgroup move is being canceled - * @from: cgroup @p was being moved from - * @to: cgroup @p was being moved to - * - * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). - * Undo the preparation. - */ - void (*cgroup_cancel_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_set_weight: A cgroup's weight is being changed - * @cgrp: cgroup whose weight is being updated - * @weight: new weight [1..10000] - * - * Update @cgrp's weight to @weight. - */ - void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); - - /** - * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed - * @cgrp: cgroup whose bandwidth is being updated - * @period_us: bandwidth control period - * @quota_us: bandwidth control quota - * @burst_us: bandwidth control burst - * - * Update @cgrp's bandwidth control parameters. This is from the cpu.max - * cgroup interface. - * - * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled - * to. For example, if @period_us is 1_000_000 and @quota_us is - * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be - * interpreted in the same fashion and specifies how much @cgrp can - * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is upto to the BPF - * scheduler. - */ - void (*cgroup_set_bandwidth)(struct cgroup *cgrp, - u64 period_us, u64 quota_us, u64 burst_us); - -#endif /* CONFIG_EXT_GROUP_SCHED */ - - /* - * All online ops must come before ops.cpu_online(). - */ - - /** - * @cpu_online: A CPU became online - * @cpu: CPU which just came up - * - * @cpu just came online. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs beforehand. - */ - void (*cpu_online)(s32 cpu); - - /** - * @cpu_offline: A CPU is going offline - * @cpu: CPU which is going offline - * - * @cpu is going offline. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs afterwards. - */ - void (*cpu_offline)(s32 cpu); - - /* - * All CPU hotplug ops must come before ops.init(). - */ - - /** - * @init: Initialize the BPF scheduler - */ - s32 (*init)(void); - - /** - * @exit: Clean up after the BPF scheduler - * @info: Exit info - * - * ops.exit() is also called on ops.init() failure, which is a bit - * unusual. This is to allow rich reporting through @info on how - * ops.init() failed. - */ - void (*exit)(struct scx_exit_info *info); - - /** - * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch - */ - u32 dispatch_max_batch; - - /** - * @flags: %SCX_OPS_* flags - */ - u64 flags; - - /** - * @timeout_ms: The maximum amount of time, in milliseconds, that a - * runnable task should be able to wait before being scheduled. The - * maximum timeout may not exceed the default timeout of 30 seconds. - * - * Defaults to the maximum allowed timeout value of 30 seconds. - */ - u32 timeout_ms; - - /** - * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default - * value of 32768 is used. - */ - u32 exit_dump_len; - - /** - * @hotplug_seq: A sequence number that may be set by the scheduler to - * detect when a hotplug event has occurred during the loading process. - * If 0, no detection occurs. Otherwise, the scheduler will fail to - * load if the sequence number does not match @scx_hotplug_seq on the - * enable path. - */ - u64 hotplug_seq; - - /** - * @name: BPF scheduler's name - * - * Must be a non-zero valid BPF object name including only isalnum(), - * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the - * BPF scheduler is enabled. - */ - char name[SCX_OPS_NAME_LEN]; - - /* internal use only, must be NULL */ - void *priv; -}; - -enum scx_opi { - SCX_OPI_BEGIN = 0, - SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), - SCX_OPI_END = SCX_OP_IDX(init), -}; - -/* - * Collection of event counters. Event types are placed in descending order. - */ -struct scx_event_stats { - /* - * If ops.select_cpu() returns a CPU which can't be used by the task, - * the core scheduler code silently picks a fallback CPU. - */ - s64 SCX_EV_SELECT_CPU_FALLBACK; - - /* - * When dispatching to a local DSQ, the CPU may have gone offline in - * the meantime. In this case, the task is bounced to the global DSQ. - */ - s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; - - /* - * If SCX_OPS_ENQ_LAST is not set, the number of times that a task - * continued to run because there were no other tasks on the CPU. - */ - s64 SCX_EV_DISPATCH_KEEP_LAST; - - /* - * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task - * is dispatched to a local DSQ when exiting. - */ - s64 SCX_EV_ENQ_SKIP_EXITING; - - /* - * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a - * migration disabled task skips ops.enqueue() and is dispatched to its - * local DSQ. - */ - s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; - - /* - * Total number of times a task's time slice was refilled with the - * default value (SCX_SLICE_DFL). - */ - s64 SCX_EV_REFILL_SLICE_DFL; - - /* - * The total duration of bypass modes in nanoseconds. - */ - s64 SCX_EV_BYPASS_DURATION; - - /* - * The number of tasks dispatched in the bypassing mode. - */ - s64 SCX_EV_BYPASS_DISPATCH; - - /* - * The number of times the bypassing mode has been activated. - */ - s64 SCX_EV_BYPASS_ACTIVATE; -}; - -struct scx_sched { - struct sched_ext_ops ops; - DECLARE_BITMAP(has_op, SCX_OPI_END); - - /* - * Dispatch queues. - * - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. - * This is to avoid live-locking in bypass mode where all tasks are - * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If - * per-node split isn't sufficient, it can be further split. - */ - struct rhashtable dsq_hash; - struct scx_dispatch_q **global_dsqs; - - /* - * The event counters are in a per-CPU variable to minimize the - * accounting overhead. A system-wide view on the event counter is - * constructed when requested by scx_bpf_events(). - */ - struct scx_event_stats __percpu *event_stats_cpu; - - bool warned_zero_slice; - - atomic_t exit_kind; - struct scx_exit_info *exit_info; - - struct kobject kobj; - - struct kthread_worker *helper; - struct irq_work error_irq_work; - struct kthread_work disable_work; - struct rcu_work rcu_work; -}; - -enum scx_wake_flags { - /* expose select WF_* flags as enums */ - SCX_WAKE_FORK = WF_FORK, - SCX_WAKE_TTWU = WF_TTWU, - SCX_WAKE_SYNC = WF_SYNC, -}; - -enum scx_enq_flags { - /* expose select ENQUEUE_* flags as enums */ - SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, - SCX_ENQ_HEAD = ENQUEUE_HEAD, - SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, - - /* high 32bits are SCX specific */ - - /* - * Set the following to trigger preemption when calling - * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the - * current task is cleared to zero and the CPU is kicked into the - * scheduling path. Implies %SCX_ENQ_HEAD. - */ - SCX_ENQ_PREEMPT = 1LLU << 32, - - /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. - */ - SCX_ENQ_REENQ = 1LLU << 40, - - /* - * The task being enqueued is the only task available for the cpu. By - * default, ext core keeps executing such tasks but when - * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the - * %SCX_ENQ_LAST flag set. - * - * The BPF scheduler is responsible for triggering a follow-up - * scheduling event. Otherwise, Execution may stall. - */ - SCX_ENQ_LAST = 1LLU << 41, - - /* high 8 bits are internal */ - __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, - - SCX_ENQ_CLEAR_OPSS = 1LLU << 56, - SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -}; - -enum scx_deq_flags { - /* expose select DEQUEUE_* flags as enums */ - SCX_DEQ_SLEEP = DEQUEUE_SLEEP, - - /* high 32bits are SCX specific */ - - /* - * The generic core-sched layer decided to execute the task even though - * it hasn't been dispatched yet. Dequeue from the BPF side. - */ - SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -}; - -enum scx_pick_idle_cpu_flags { - SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ - SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ -}; - -enum scx_kick_flags { - /* - * Kick the target CPU if idle. Guarantees that the target CPU goes - * through at least one full scheduling cycle before going idle. If the - * target CPU can be determined to be currently not idle and going to go - * through a scheduling cycle before going idle, noop. - */ - SCX_KICK_IDLE = 1LLU << 0, - - /* - * Preempt the current task and execute the dispatch path. If the - * current task of the target CPU is an SCX task, its ->scx.slice is - * cleared to zero before the scheduling path is invoked so that the - * task expires and the dispatch path is invoked. - */ - SCX_KICK_PREEMPT = 1LLU << 1, - - /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. - */ - SCX_KICK_WAIT = 1LLU << 2, -}; - -enum scx_tg_flags { - SCX_TG_ONLINE = 1U << 0, - SCX_TG_INITED = 1U << 1, -}; - -enum scx_enable_state { - SCX_ENABLING, - SCX_ENABLED, - SCX_DISABLING, - SCX_DISABLED, -}; - -static const char *scx_enable_state_str[] = { - [SCX_ENABLING] = "enabling", - [SCX_ENABLED] = "enabled", - [SCX_DISABLING] = "disabling", - [SCX_DISABLED] = "disabled", -}; - -/* - * sched_ext_entity->ops_state - * - * Used to track the task ownership between the SCX core and the BPF scheduler. - * State transitions look as follows: - * - * NONE -> QUEUEING -> QUEUED -> DISPATCHING - * ^ | | - * | v v - * \-------------------------------/ - * - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call - * sites for explanations on the conditions being waited upon and why they are - * safe. Transitions out of them into NONE or QUEUED must store_release and the - * waiters should load_acquire. - * - * Tracking scx_ops_state enables sched_ext core to reliably determine whether - * any given task can be dispatched by the BPF scheduler at all times and thus - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler - * to try to dispatch any task anytime regardless of its state as the SCX core - * can safely reject invalid dispatches. - */ -enum scx_ops_state { - SCX_OPSS_NONE, /* owned by the SCX core */ - SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ - SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ - SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ - - /* - * QSEQ brands each QUEUED instance so that, when dispatch races - * dequeue/requeue, the dispatcher can tell whether it still has a claim - * on the task being dispatched. - * - * As some 32bit archs can't do 64bit store_release/load_acquire, - * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on - * 32bit machines. The dispatch race window QSEQ protects is very narrow - * and runs with IRQ disabled. 30 bits should be sufficient. - */ - SCX_OPSS_QSEQ_SHIFT = 2, -}; - -/* Use macros to ensure that the type is unsigned long for the masks */ -#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) - /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -1170,7 +136,7 @@ static struct kset *scx_kset; #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); -static void scx_bpf_kick_cpu(s32 cpu, u64 flags); +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch, va_end(args); } -static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, - const char *fmt, ...) -{ - struct scx_sched *sch; - va_list args; - - rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) { - va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); - va_end(args); - } - rcu_read_unlock(); -} - #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) -#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } -static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, + struct task_struct *p) { - struct scx_sched *sch = scx_root; - return sch->global_dsqs[cpu_to_node(task_cpu(p))]; } @@ -1363,11 +311,11 @@ do { \ }) /* @mask is constant, always inline to cull unnecessary branches */ -static __always_inline bool scx_kf_allowed(u32 mask) +static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) { if (unlikely(!(current->scx.kf_mask & mask))) { - scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); + scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); return false; } @@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask) */ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_kf_error("cpu_release kfunc called from a nested operation"); + scx_error(sch, "cpu_release kfunc called from a nested operation"); return false; } if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_kf_error("dispatch kfunc called from a nested operation"); + scx_error(sch, "dispatch kfunc called from a nested operation"); return false; } @@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask) } /* see SCX_CALL_OP_TASK() */ -static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, +static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, + u32 mask, struct task_struct *p) { - if (!scx_kf_allowed(mask)) + if (!scx_kf_allowed(sch, mask)) return false; if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { - scx_kf_error("called on a task not being operated on"); + scx_error(sch, "called on a task not being operated on"); return false; } @@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq { */ struct scx_task_iter { struct sched_ext_entity cursor; - struct task_struct *locked; + struct task_struct *locked_task; struct rq *rq; struct rq_flags rf; u32 cnt; + bool list_locked; }; /** @@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter) iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked = NULL; + iter->locked_task = NULL; iter->cnt = 0; + iter->list_locked = true; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; + if (iter->locked_task) { + task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); + iter->locked_task = NULL; } } @@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) * * If @iter is in the middle of a locked iteration, it may be locking the rq of * the task currently being visited in addition to scx_tasks_lock. Unlock both. - * This function can be safely called anytime during an iteration. + * This function can be safely called anytime during an iteration. The next + * iterator operation will automatically restore the necessary locking. */ static void scx_task_iter_unlock(struct scx_task_iter *iter) { __scx_task_iter_rq_unlock(iter); - spin_unlock_irq(&scx_tasks_lock); + if (iter->list_locked) { + iter->list_locked = false; + spin_unlock_irq(&scx_tasks_lock); + } } -/** - * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() - * @iter: iterator to re-lock - * - * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it - * doesn't re-lock the rq lock. Must be called before other iterator operations. - */ -static void scx_task_iter_relock(struct scx_task_iter *iter) +static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { - spin_lock_irq(&scx_tasks_lock); + if (!iter->list_locked) { + spin_lock_irq(&scx_tasks_lock); + iter->list_locked = true; + } } /** @@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { + __scx_task_iter_maybe_relock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); } @@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; + __scx_task_iter_maybe_relock(iter); + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - scx_task_iter_relock(iter); + __scx_task_iter_maybe_relock(iter); } list_for_each_entry(pos, cursor, tasks_node) { @@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return NULL; iter->rq = task_rq_lock(p, &iter->rf); - iter->locked = p; + iter->locked_task = p; return p; } @@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This can be used when preemption is not disabled. */ #define scx_add_event(sch, name, cnt) do { \ - this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, (cnt)); \ } while(0) @@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This should be used only when preemption is disabled. */ #define __scx_add_event(sch, name, cnt) do { \ - __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, cnt); \ } while(0) @@ -1766,23 +720,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) } /** - * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args - * @cpu: cpu number which came from a BPF ops - * @where: extra information reported on error - * - * The same as ops_cpu_valid() but @sch is implicit. - */ -static bool kf_cpu_valid(u32 cpu, const char *where) -{ - if (__cpu_valid(cpu)) { - return true; - } else { - scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); - return false; - } -} - -/** * ops_sanitize_err - Sanitize a -errno value * @sch: scx_sched to error out on error * @ops_name: operation to blame on failure @@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) WRITE_ONCE(dsq->nr, dsq->nr + delta); } -static void refill_task_slice_dfl(struct task_struct *p) +static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); + __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, @@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); raw_spin_lock(&dsq->lock); } } @@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return find_global_dsq(p); + return find_global_dsq(sch, p); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == SCX_DSQ_GLOBAL) - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); else dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return find_global_dsq(p); + return find_global_dsq(sch, p); } return dsq; } -static void mark_direct_dispatch(struct task_struct *ddsp_task, +static void mark_direct_dispatch(struct scx_sched *sch, + struct task_struct *ddsp_task, struct task_struct *p, u64 dsq_id, u64 enq_flags) { @@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) - scx_kf_error("%s[%d] already direct-dispatched", + scx_error(sch, "%s[%d] already direct-dispatched", p->comm, p->pid); else - scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", ddsp_task->comm, ddsp_task->pid, p->comm, p->pid); return; @@ -2333,15 +1271,15 @@ local: * higher priority it becomes from scx_prio_less()'s POV. */ touch_core_sched(rq, p); - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); local_norefill: dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return; global: touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(p); - dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); + refill_task_slice_dfl(sch, p); + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch, if (!scx_rq_online(rq)) { if (enforce) - __scx_add_event(scx_root, - SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; } @@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dst_dsq = find_global_dsq(p); + dst_dsq = find_global_dsq(sch, p); dst_rq = src_rq; } } else { @@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dispatch_enqueue(sch, find_global_dsq(p), p, + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use - * scx_bpf_kick_cpu() for deferred kicking. + * scx_kick_cpu() for deferred kicking. */ if (unlikely(!--nr_loops)) { - scx_bpf_kick_cpu(cpu_of(rq), 0); + scx_kick_cpu(sch, cpu_of(rq), 0); break; } } while (dspc->nr_tasks); @@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(p); + refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); if (!p) { if (kick_idle) - scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); + scx_kick_cpu(rcu_dereference_sched(scx_root), + cpu_of(rq), SCX_KICK_IDLE); return NULL; } if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = rcu_dereference_sched(scx_root); if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); sch->warned_zero_slice = true; } - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); } } @@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; } else { cpu = prev_cpu; @@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED -DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) @@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg) WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = @@ -4122,7 +3058,6 @@ int scx_tg_online(struct task_group *tg) tg->scx.flags |= SCX_TG_ONLINE; } - percpu_up_read(&scx_cgroup_rwsem); return ret; } @@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg) WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && (tg->scx.flags & SCX_TG_INITED)) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); - - percpu_up_read(&scx_cgroup_rwsem); } int scx_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *p; int ret; - /* released in scx_finish/cancel_attach() */ - percpu_down_read(&scx_cgroup_rwsem); - if (!scx_cgroup_enabled) return 0; @@ -4192,7 +3120,6 @@ err: p->scx.cgrp_moving_from = NULL; } - percpu_up_read(&scx_cgroup_rwsem); return ops_sanitize_err(sch, "cgroup_prep_move", ret); } @@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p) p->scx.cgrp_moving_from = NULL; } -void scx_cgroup_finish_attach(void) -{ - percpu_up_read(&scx_cgroup_rwsem); -} - void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) { struct scx_sched *sch = scx_root; @@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct task_struct *p; if (!scx_cgroup_enabled) - goto out_unlock; + return; cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, cgroup_cancel_move) && @@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } -out_unlock: - percpu_up_read(&scx_cgroup_rwsem); } void scx_group_set_weight(struct task_group *tg, unsigned long weight) { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) tg->scx.weight = weight; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_idle(struct task_group *tg, bool idle) @@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg, { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || @@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg, tg->scx.bw_quota_us = quota_us; tg->scx.bw_burst_us = burst_us; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } static void scx_cgroup_lock(void) { - percpu_down_write(&scx_cgroup_rwsem); + percpu_down_write(&scx_cgroup_ops_rwsem); + cgroup_lock(); } static void scx_cgroup_unlock(void) { - percpu_up_write(&scx_cgroup_rwsem); + cgroup_unlock(); + percpu_up_write(&scx_cgroup_ops_rwsem); } #else /* CONFIG_EXT_GROUP_SCHED */ -static inline void scx_cgroup_lock(void) {} -static inline void scx_cgroup_unlock(void) {} +static void scx_cgroup_lock(void) {} +static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED */ @@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch) { struct cgroup_subsys_state *css; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - scx_cgroup_enabled = false; /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and exit all the inited ones, all online cgroups are exited. */ - rcu_read_lock(); css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); @@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch) if (!sch->ops.cgroup_exit) continue; - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup); - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); } static int scx_cgroup_init(struct scx_sched *sch) @@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch) struct cgroup_subsys_state *css; int ret; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and init, all online cgroups are initialized. */ - rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { @@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch) continue; } - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { @@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch) return ret; } tg->scx.flags |= SCX_TG_INITED; - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); WARN_ON_ONCE(scx_cgroup_enabled); scx_cgroup_enabled = true; @@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) int node; kthread_stop(sch->helper->task); - free_percpu(sch->event_stats_cpu); + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -4671,9 +3571,22 @@ bool task_should_scx(int policy) bool scx_allow_ttwu_queue(const struct task_struct *p) { - return !scx_enabled() || - (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || - p->sched_class != &ext_sched_class; + struct scx_sched *sch; + + if (!scx_enabled()) + return true; + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return true; + + if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + return true; + + if (unlikely(p->sched_class != &ext_sched_class)) + return true; + + return false; } /** @@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void) * * - pick_next_task() suppresses zero slice warning. * - * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * - scx_prio_less() reverts to the default core_sched_at order. @@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", p->scx.dsq_vtime, p->scx.slice, p->scx.weight); - dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), + p->migration_disabled); if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); @@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sch->global_dsqs[node] = dsq; } - sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); - if (!sch->event_stats_cpu) + sch->pcpu = alloc_percpu(struct scx_sched_pcpu); + if (!sch->pcpu) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); if (!sch->helper) - goto err_free_event_stats; + goto err_free_pcpu; sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) err_stop_helper: kthread_stop(sch->helper->task); -err_free_event_stats: - free_percpu(sch->event_stats_cpu); +err_free_pcpu: + free_percpu(sch->pcpu); err_free_gdsqs: for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_error(sch, "ops.init() failed (%d)", ret); goto err_disable; } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) @@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = scx_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); @@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - scx_task_iter_relock(&sti); } scx_task_iter_stop(&sti); scx_cgroup_unlock(); @@ -5795,7 +4708,7 @@ err_unlock: err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - scx_bypass(false); + /* we'll soon enter disable path, keep bypass on */ err_disable: mutex_unlock(&scx_enable_mutex); /* @@ -6328,40 +5241,41 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. */ -static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) +static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) { - if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; lockdep_assert_irqs_disabled(); if (unlikely(!p)) { - scx_kf_error("called with NULL task"); + scx_error(sch, "called with NULL task"); return false; } if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_kf_error("invalid enq_flags 0x%llx", enq_flags); + scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); return false; } return true; } -static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, - u64 enq_flags) +static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 enq_flags) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct task_struct *ddsp_task; ddsp_task = __this_cpu_read(direct_dispatch_task); if (ddsp_task) { - mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); return; } if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { - scx_kf_error("dispatch buffer overflow"); + scx_error(sch, "dispatch buffer overflow"); return; } @@ -6413,7 +5327,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) return; if (slice) @@ -6421,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice else p->scx.slice = p->scx.slice ?: 1; - scx_dsq_insert_commit(p, dsq_id, enq_flags); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); } /** @@ -6448,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) return; if (slice) @@ -6458,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, p->scx.dsq_vtime = vtime; - scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } __bpf_kfunc_end_defs(); @@ -6483,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, bool in_balance; unsigned long flags; - if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed_if_unlocked() && + !scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; /* @@ -6568,7 +5497,15 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) { - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return 0; return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); @@ -6583,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) __bpf_kfunc void scx_bpf_dispatch_cancel(void) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_sched *sch; + + guard(rcu)(); - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return; if (dspc->cursor > 0) dspc->cursor--; else - scx_kf_error("dispatch buffer underflow"); + scx_error(sch, "dispatch buffer underflow"); } /** @@ -6609,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) */ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { - struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; + struct scx_sched *sch; + + guard(rcu)(); - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; flush_dispatch_buf(sch, dspc->rq); @@ -6760,12 +5710,18 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) { + struct scx_sched *sch; LIST_HEAD(tasks); u32 nr_enqueued = 0; struct rq *rq; struct task_struct *p, *n; - if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) return 0; rq = cpu_rq(smp_processor_id()); @@ -6788,12 +5744,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. */ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); @@ -6881,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); -/** - * scx_bpf_kick_cpu - Trigger reschedule on a CPU - * @cpu: cpu to kick - * @flags: %SCX_KICK_* flags - * - * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or - * trigger rescheduling on a busy CPU. This can be called from any online - * scx_ops operation and the actual kicking is performed asynchronously through - * an irq work. - */ -__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; unsigned long irq_flags; - if (!kf_cpu_valid(cpu, NULL)) + if (!ops_cpu_valid(sch, cpu, NULL)) return; local_irq_save(irq_flags); @@ -6920,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *target_rq = cpu_rq(cpu); if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) - scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -6945,6 +5887,26 @@ out: } /** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (likely(sch)) + scx_kick_cpu(sch, cpu, flags); +} + +/** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ * @@ -7124,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __bpf_kfunc_end_defs(); -static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, - char *fmt, unsigned long long *data, u32 data__sz) +static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, + size_t line_size, char *fmt, unsigned long long *data, + u32 data__sz) { struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; s32 ret; if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || (data__sz && !data)) { - scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); + scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); return -EINVAL; } ret = copy_from_kernel_nofault(data_buf, data, data__sz); if (ret < 0) { - scx_kf_error("failed to read data fields (%d)", ret); + scx_error(sch, "failed to read data fields (%d)", ret); return ret; } ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bprintf_data); if (ret < 0) { - scx_kf_error("format preparation failed (%d)", ret); + scx_error(sch, "format preparation failed (%d)", ret); return ret; } @@ -7153,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, bprintf_data.bin_args); bpf_bprintf_cleanup(&bprintf_data); if (ret < 0) { - scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); + scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); return ret; } return ret; } -static s32 bstr_format(struct scx_bstr_buf *buf, +static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, char *fmt, unsigned long long *data, u32 data__sz) { - return __bstr_format(buf->data, buf->line, sizeof(buf->line), + return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), fmt, data, data__sz); } @@ -7182,11 +6145,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7202,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7225,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; struct scx_dump_data *dd = &scx_dump_data; struct scx_bstr_buf *buf = &dd->buf; s32 ret; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + if (raw_smp_processor_id() != dd->cpu) { - scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); return; } /* append the formatted string to the line buf */ - ret = __bstr_format(buf->data, buf->line + dd->cursor, + ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, sizeof(buf->line) - dd->cursor, fmt, data, data__sz); if (ret < 0) { dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", @@ -7271,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, */ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) { - if (kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_cpu_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7293,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) */ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) { - if (kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_freq_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7315,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) */ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(sch); + if (unlikely(!sch)) + return; + if (unlikely(perf > SCX_CPUPERF_ONE)) { - scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); return; } - if (kf_cpu_valid(cpu, NULL)) { + if (ops_cpu_valid(sch, cpu, NULL)) { struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); struct rq_flags rf; @@ -7329,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) * to the corresponding CPU to prevent ABBA deadlocks. */ if (locked_rq && rq != locked_rq) { - scx_kf_error("Invalid target CPU %d", cpu); + scx_error(sch, "Invalid target CPU %d", cpu); return; } @@ -7424,13 +6418,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) */ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) { - if (!kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) return NULL; + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + + if (!sch->warned_deprecated_rq) { + printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " + "use scx_bpf_locked_rq() when holding rq lock " + "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); + sch->warned_deprecated_rq = true; + } + return cpu_rq(cpu); } /** + * scx_bpf_locked_rq - Return the rq currently locked by SCX + * + * Returns the rq if a rq lock is currently held by SCX. + * Otherwise emits an error and returns NULL. + */ +__bpf_kfunc struct rq *scx_bpf_locked_rq(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(preempt)(); + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return NULL; + + rq = scx_locked_rq(); + if (!rq) { + scx_error(sch, "accessing rq without holding rq lock"); + return NULL; + } + + return rq; +} + +/** + * scx_bpf_cpu_curr - Return remote CPU's curr task + * @cpu: CPU of interest + * + * Callers must hold RCU read lock (KF_RCU). + */ +__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + + return rcu_dereference(cpu_rq(cpu)->curr); +} + +/** * scx_bpf_task_cgroup - Return the sched cgroup of a task * @p: task of interest * @@ -7446,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) { struct task_group *tg = p->sched_task_group; struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out; - if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) goto out; cgrp = tg_cgrp(tg); @@ -7528,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event /* Aggregate per-CPU event counters into @events. */ memset(events, 0, sizeof(*events)); for_each_possible_cpu(cpu) { - e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); + e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); @@ -7594,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 292bb41a242e..43429b33e52c 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,29 +8,6 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT -static inline bool scx_kf_allowed_if_unlocked(void) -{ - return !current->scx.kf_mask; -} - -static inline bool scx_rq_bypassing(struct rq *rq) -{ - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); -} - -DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); - -DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); - -/* - * Return the rq currently locked from an scx callback, or NULL if no rq is - * locked. - */ -static inline struct rq *scx_locked_rq(void) -{ - return __this_cpu_read(scx_locked_rq_state); -} - void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -100,7 +77,6 @@ int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); void scx_cgroup_move_task(struct task_struct *p); -void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); @@ -111,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } static inline void scx_cgroup_move_task(struct task_struct *p) {} -static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7174e1c1a392..d2434c954848 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -819,10 +819,10 @@ void scx_idle_disable(void) * Helpers that can be called from the BPF scheduler. */ -static int validate_node(int node) +static int validate_node(struct scx_sched *sch, int node) { if (!static_branch_likely(&scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is disabled"); + scx_error(sch, "per-node idle tracking is disabled"); return -EOPNOTSUPP; } @@ -832,13 +832,13 @@ static int validate_node(int node) /* Make sure node is in a valid range */ if (node < 0 || node >= nr_node_ids) { - scx_kf_error("invalid node %d", node); + scx_error(sch, "invalid node %d", node); return -EINVAL; } /* Make sure the node is part of the set of possible nodes */ if (!node_possible(node)) { - scx_kf_error("unavailable node %d", node); + scx_error(sch, "unavailable node %d", node); return -EINVAL; } @@ -847,26 +847,53 @@ static int validate_node(int node) __bpf_kfunc_start_defs(); -static bool check_builtin_idle_enabled(void) +static bool check_builtin_idle_enabled(struct scx_sched *sch) { if (static_branch_likely(&scx_builtin_idle_enabled)) return true; - scx_kf_error("built-in idle tracking is disabled"); + scx_error(sch, "built-in idle tracking is disabled"); return false; } -static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, +/* + * Determine whether @p is a migration-disabled task in the context of BPF + * code. + * + * We can't simply check whether @p->migration_disabled is set in a + * sched_ext callback, because migration is always disabled for the current + * task while running BPF code. + * + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively + * disable and re-enable migration. For this reason, the current task + * inside a sched_ext callback is always a migration-disabled task. + * + * Therefore, when @p->migration_disabled == 1, check whether @p is the + * current task or not: if it is, then migration was not disabled before + * entering the callback, otherwise migration was disabled. + * + * Returns true if @p is migration-disabled, false otherwise. + */ +static bool is_bpf_migration_disabled(const struct task_struct *p) +{ + if (p->migration_disabled == 1) + return p != current; + else + return p->migration_disabled; +} + +static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, + s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { struct rq *rq; struct rq_flags rf; s32 cpu; - if (!kf_cpu_valid(prev_cpu, NULL)) + if (!ops_cpu_valid(sch, prev_cpu, NULL)) return -EINVAL; - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; /* @@ -879,7 +906,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f if (scx_kf_allowed_if_unlocked()) { rq = task_rq_lock(p, &rf); } else { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) + if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) return -EPERM; rq = scx_locked_rq(); } @@ -898,7 +925,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f * selection optimizations and simply check whether the previously * used CPU is idle and within the allowed cpumask. */ - if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) { + if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) { if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && scx_idle_test_and_clear_cpu(prev_cpu)) cpu = prev_cpu; @@ -922,9 +949,13 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f */ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) { - if (!kf_cpu_valid(cpu, NULL)) - return NUMA_NO_NODE; + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) + return NUMA_NO_NODE; return cpu_to_node(cpu); } @@ -946,15 +977,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { + struct scx_sched *sch; s32 cpu; - cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { *is_idle = true; return cpu; } *is_idle = false; - return prev_cpu; } @@ -981,7 +1018,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags) { - return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, + cpus_allowed, flags); } /** @@ -995,7 +1041,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; @@ -1011,12 +1065,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; return idle_cpumask(NUMA_NO_NODE)->cpu; @@ -1034,7 +1096,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; @@ -1054,12 +1124,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; if (sched_smt_active()) @@ -1095,10 +1173,18 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) */ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) { - if (!check_builtin_idle_enabled()) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) return false; - if (!kf_cpu_valid(cpu, NULL)) + if (!check_builtin_idle_enabled(sch)) + return false; + + if (!ops_cpu_valid(sch, cpu, NULL)) return false; return scx_idle_test_and_clear_cpu(cpu); @@ -1126,7 +1212,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1158,12 +1252,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); @@ -1193,9 +1295,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { + struct scx_sched *sch; s32 cpu; - node = validate_node(node); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1233,10 +1342,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; s32 cpu; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h new file mode 100644 index 000000000000..b3617abed510 --- /dev/null +++ b/kernel/sched/ext_internal.h @@ -0,0 +1,1078 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_TASK_ITER_BATCH = 32, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ + SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ + SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +/* + * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), + * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes + * are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +enum scx_exit_flags { + /* + * ops.exit() may be called even if the loading failed before ops.init() + * finishes successfully. This is because ops.exit() allows rich exit + * info communication. The following flag indicates whether ops.init() + * finished successfully. + */ + SCX_EFLAG_INITIALIZED, +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. + */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_kind kind; + + /* exit code if gracefully exiting */ + s64 exit_code; + + /* %SCX_EFLAG_* */ + u64 flags; + + /* textual representation of the above */ + const char *reason; + + /* backtrace if exiting due to an error */ + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; + + /* debug dump */ + char *dump; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other task to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. + */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + /* + * If set, only tasks with policy set to SCHED_EXT are attached to + * sched_ext. If clear, SCHED_NORMAL tasks are also included. + */ + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, + + /* + * CPU cgroup support flags + */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | + SCX_OPS_HAS_CGROUP_WEIGHT, + + /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ + __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, + + SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, +}; + +/* argument container for ops.init_task() */ +struct scx_init_task_args { + /* + * Set if ops.init_task() is being invoked on the fork path, as opposed + * to the scheduler transition path. + */ + bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; + + /* bandwidth control parameters from cpu.max and cpu.max.burst */ + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/* + * Informational context provided to dump operations. + */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. + */ +struct sched_ext_ops { + /** + * @select_cpu: Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. + * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * @enqueue: Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. + * + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is + * skipped. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * @dequeue: Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). + * + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * @tick: Periodic tick + * @p: task running currently + * + * This operation is called every 1/HZ seconds on CPUs which are + * executing an SCX task. Setting @p->scx.slice to 0 will trigger an + * immediate dispatch cycle on the CPU. + */ + void (*tick)(struct task_struct *p); + + /** + * @runnable: A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU, or when @p is + * being enqueued on a CPU experiencing a hotplug event. Likewise, a + * task may be ->enqueue()'d without being preceded by this operation + * e.g. after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * @running: A task is starting to run on its associated CPU + * @p: task starting to run + * + * Note that this callback may be called from a CPU other than the + * one the task is going to run on. This can happen when a task + * property is changed (i.e., affinity), since scx_next_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to determine the + * target CPU the task is going to use. + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * @stopping: A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * Note that this callback may be called from a CPU other than the + * one the task was running on. This can happen when a task + * property is changed (i.e., affinity), since dequeue_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU + * the task was running on. + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * @quiescent: A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * @yield: Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. + */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * @core_sched_before: Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + + /** + * @set_weight: Set task weight + * @p: task to set weight for + * @weight: new weight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * @set_cpumask: Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * @update_idle: Update the idle state of a CPU + * @cpu: CPU to update the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU goes or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_select_cpu_and() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * @cpu_acquire: A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * @cpu_release: A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + + /** + * @init_task: Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. + */ + s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); + + /** + * @exit_task: Exit a previously-running task from the system + * @p: task to exit + * @args: exit arguments, see the struct definition + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); + + /** + * @enable: Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * + * Enable @p for BPF scheduling. enable() is called on @p any time it + * enters SCX, and is always paired with a matching disable(). + */ + void (*enable)(struct task_struct *p); + + /** + * @disable: Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. + */ + void (*disable)(struct task_struct *p); + + /** + * @dump: Dump BPF scheduler state on error + * @ctx: debug dump context + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. + */ + void (*dump)(struct scx_dump_ctx *ctx); + + /** + * @dump_cpu: Dump BPF scheduler state for a CPU on error + * @ctx: debug dump context + * @cpu: CPU to generate debug dump for + * @idle: @cpu is currently idle without any runnable tasks + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @cpu. If @idle is %true and this operation doesn't produce any + * output, @cpu is skipped for dump. + */ + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); + + /** + * @dump_task: Dump BPF scheduler state for a runnable task on error + * @ctx: debug dump context + * @p: runnable task to generate debug dump for + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @p. + */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * @cgroup_init: Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * @cgroup_exit: Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation my block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_move: Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_cancel_move: Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_set_weight: A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @cgrp's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + + /** + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed + * @cgrp: cgroup whose bandwidth is being updated + * @period_us: bandwidth control period + * @quota_us: bandwidth control quota + * @burst_us: bandwidth control burst + * + * Update @cgrp's bandwidth control parameters. This is from the cpu.max + * cgroup interface. + * + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled + * to. For example, if @period_us is 1_000_000 and @quota_us is + * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be + * interpreted in the same fashion and specifies how much @cgrp can + * burst temporarily. The specific control mechanism and thus the + * interpretation of @period_us and burstiness is upto to the BPF + * scheduler. + */ + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + +#endif /* CONFIG_EXT_GROUP_SCHED */ + + /* + * All online ops must come before ops.cpu_online(). + */ + + /** + * @cpu_online: A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * @cpu_offline: A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). + */ + + /** + * @init: Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * @exit: Clean up after the BPF scheduler + * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. + */ + void (*exit)(struct scx_exit_info *info); + + /** + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * @flags: %SCX_OPS_* flags + */ + u64 flags; + + /** + * @timeout_ms: The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + + /** + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default + * value of 32768 is used. + */ + u32 exit_dump_len; + + /** + * @hotplug_seq: A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + + /** + * @name: BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void *priv; +}; + +enum scx_opi { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), +}; + +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * Total number of times a task's time slice was refilled with the + * default value (SCX_SLICE_DFL). + */ + s64 SCX_EV_REFILL_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +struct scx_sched_pcpu { + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). + */ + struct scx_event_stats event_stats; +}; + +struct scx_sched { + struct sched_ext_ops ops; + DECLARE_BITMAP(has_op, SCX_OPI_END); + + /* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. + * This is to avoid live-locking in bypass mode where all tasks are + * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If + * per-node split isn't sufficient, it can be further split. + */ + struct rhashtable dsq_hash; + struct scx_dispatch_q **global_dsqs; + struct scx_sched_pcpu __percpu *pcpu; + + bool warned_zero_slice:1; + bool warned_deprecated_rq:1; + + atomic_t exit_kind; + struct scx_exit_info *exit_info; + + struct kobject kobj; + + struct kthread_worker *helper; + struct irq_work error_irq_work; + struct kthread_work disable_work; + struct rcu_work rcu_work; +}; + +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. + * + * The BPF scheduler is responsible for triggering a follow-up + * scheduling event. Otherwise, Execution may stall. + */ + SCX_ENQ_LAST = 1LLU << 41, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ +}; + +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. + */ + SCX_KICK_PREEMPT = 1LLU << 1, + + /* + * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will + * return after the target CPU finishes picking the next task. + */ + SCX_KICK_WAIT = 1LLU << 2, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +enum scx_enable_state { + SCX_ENABLING, + SCX_ENABLED, + SCX_DISABLING, + SCX_DISABLED, +}; + +static const char *scx_enable_state_str[] = { + [SCX_ENABLING] = "enabling", + [SCX_ENABLED] = "enabled", + [SCX_DISABLING] = "disabling", + [SCX_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(scx_locked_rq_state); +} + +static inline bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + +static inline bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..3a89f949e307 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ out: } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; @@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se) if (!gcfs_rq || !gcfs_rq->load.weight) return; - if (throttled_hierarchy(gcfs_rq)) - return; - shares = calc_group_shares(gcfs_rq); if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); @@ -5291,18 +5288,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); - } else { + list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); -#endif + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } +#endif } } @@ -5341,8 +5336,6 @@ static void set_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable--; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5363,8 +5356,6 @@ static void clear_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable++; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5392,7 +5383,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * DELAY_DEQUEUE relies on spurious wakeups, special task * states must not suffer spurious wakeups, excempt them. */ - if (flags & DEQUEUE_SPECIAL) + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) delay = false; WARN_ON_ONCE(delay && se->sched_delayed); @@ -5450,8 +5441,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_queued == 0) + if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); +#ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { + struct rq *rq = rq_of(cfs_rq); + + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; + } +#endif + } return true; } @@ -5725,74 +5726,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return cfs_bandwidth_used() && cfs_rq->throttle_count; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); +} + +static inline bool task_is_throttled(struct task_struct *p) +{ + return cfs_bandwidth_used() && p->throttled; +} + +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); +static void throttle_cfs_rq_work(struct callback_head *work) +{ + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); + struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; + + WARN_ON_ONCE(p != current); + p->sched_throttle_work.next = &p->sched_throttle_work; + + /* + * If task is exiting, then there won't be a return to userspace, so we + * don't have to bother with any of this. + */ + if ((p->flags & PF_EXITING)) + return; + + scoped_guard(task_rq_lock, p) { + se = &p->se; + cfs_rq = cfs_rq_of(se); + + /* Raced, forget */ + if (p->sched_class != &fair_sched_class) + return; + + /* + * If not in limbo, then either replenish has happened or this + * task got migrated out of the throttled cfs_rq, move along. + */ + if (!cfs_rq->throttle_count) + return; + rq = scope.rq; + update_rq_clock(rq); + WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + /* + * Must not set throttled before dequeue or dequeue will + * mistakenly regard this task as an already throttled one. + */ + p->throttled = true; + resched_curr(rq); + } +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + /* - * Ensure that neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. + * Task is throttled and someone wants to dequeue it again: + * it could be sched/core when core needs to do things like + * task affinity change, task group change, task sched class + * change etc. and in these cases, DEQUEUE_SLEEP is not set; + * or the task is blocked after throttled due to freezer etc. + * and in these cases, DEQUEUE_SLEEP is set. */ -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static void detach_task_cfs_rq(struct task_struct *p); +static void dequeue_throttled_task(struct task_struct *p, int flags) +{ + WARN_ON_ONCE(p->se.on_rq); + list_del_init(&p->throttle_node); + + /* task blocked after throttled */ + if (flags & DEQUEUE_SLEEP) { + p->throttled = false; + return; + } + + /* + * task is migrating off its old cfs_rq, detach + * the task's load from its old cfs_rq. + */ + if (task_on_rq_migrating(p)) + detach_task_cfs_rq(p); +} + +static bool enqueue_throttled_task(struct task_struct *p) { - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); - src_cfs_rq = tg->cfs_rq[src_cpu]; - dest_cfs_rq = tg->cfs_rq[dest_cpu]; + /* @p should have gone through dequeue_throttled_task() first */ + WARN_ON_ONCE(!list_empty(&p->throttle_node)); + + /* + * If the throttled task @p is enqueued to a throttled cfs_rq, + * take the fast path by directly putting the task on the + * target cfs_rq's limbo list. + * + * Do not do that when @p is current because the following race can + * cause @p's group_node to be incorectly re-insterted in its rq's + * cfs_tasks list, despite being throttled: + * + * cpuX cpuY + * p ret2user + * throttle_cfs_rq_work() sched_move_task(p) + * LOCK task_rq_lock + * dequeue_task_fair(p) + * UNLOCK task_rq_lock + * LOCK task_rq_lock + * task_current_donor(p) == true + * task_on_rq_queued(p) == true + * dequeue_task(p) + * put_prev_task(p) + * sched_change_group() + * enqueue_task(p) -> p's new cfs_rq + * is throttled, go + * fast path and skip + * actual enqueue + * set_next_task(p) + * list_move(&se->group_node, &rq->cfs_tasks); // bug + * schedule() + * + * In the above race case, @p current cfs_rq is in the same rq as + * its previous cfs_rq because sched_move_task() only moves a task + * to a different group from the same rq, so we can use its current + * cfs_rq to derive rq and test if the task is current. + */ + if (throttled_hierarchy(cfs_rq) && + !task_current_donor(rq_of(cfs_rq), p)) { + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + return true; + } - return throttled_hierarchy(src_cfs_rq) || - throttled_hierarchy(dest_cfs_rq); + /* we can't take the fast path, do an actual enqueue*/ + p->throttled = false; + return false; } +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct task_struct *p, *tmp; + + if (--cfs_rq->throttle_count) + return 0; - cfs_rq->throttle_count--; - if (!cfs_rq->throttle_count) { + if (cfs_rq->pelt_clock_throttled) { cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; - if (cfs_rq->throttled_clock_self) { - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + cfs_rq->throttled_clock_self = 0; - cfs_rq->throttled_clock_self = 0; + if (WARN_ON_ONCE((s64)delta < 0)) + delta = 0; - if (WARN_ON_ONCE((s64)delta < 0)) - delta = 0; + cfs_rq->throttled_clock_self_time += delta; + } - cfs_rq->throttled_clock_self_time += delta; - } + /* Re-enqueue the tasks that have been throttled at this level. */ + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_del_init(&p->throttle_node); + p->throttled = false; + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); } + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + return 0; } +static inline bool task_has_throttle_work(struct task_struct *p) +{ + return p->sched_throttle_work.next != &p->sched_throttle_work; +} + +static inline void task_throttle_setup_work(struct task_struct *p) +{ + if (task_has_throttle_work(p)) + return; + + /* + * Kthreads and exiting tasks don't return to userspace, so adding the + * work is pointless + */ + if ((p->flags & (PF_EXITING | PF_KTHREAD))) + return; + + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); +} + +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); - list_del_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttle_count++) + return 0; - WARN_ON_ONCE(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ + if (!cfs_rq->nr_queued) { + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; } - cfs_rq->throttle_count++; + WARN_ON_ONCE(cfs_rq->throttled_clock_self); + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -5800,8 +5980,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta, dequeue = 1; + int dequeue = 1; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5824,76 +6003,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!dequeue) return false; /* Throttle no longer required. */ - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - int flags; - - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - /* - * Abuse SPECIAL to avoid delayed dequeue in this instance. - * This avoids teaching dequeue_entities() about throttled - * entities and keeps things relatively simple. - */ - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; - if (se->sched_delayed) - flags |= DEQUEUE_DELAYED; - dequeue_entity(qcfs_rq, se, flags); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - - if (qcfs_rq->load.weight) { - /* Avoid re-evaluating load for this entity: */ - se = parent_entity(se); - break; - } - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - } - - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, queued_delta); -done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; WARN_ON_ONCE(cfs_rq->throttled_clock); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5901,9 +6021,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta; - long rq_h_nr_queued = rq->cfs.h_nr_queued; + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + + /* + * It's possible we are called with !runtime_remaining due to things + * like user changed quota setting(see tg_set_cfs_bandwidth()) or async + * unthrottled us with a positive runtime_remaining but other still + * running entities consumed those runtime before we reached here. + * + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining + * because any enqueue in tg_unthrottle_up() will immediately trigger a + * throttle, which is not supposed to happen on unthrottle path. + */ + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) + return; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5933,62 +6064,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (list_add_leaf_cfs_rq(cfs_rq_of(se))) break; } - goto unthrottle_throttle; - } - - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - /* Handle any unfinished DELAY_DEQUEUE business first. */ - if (se->sched_delayed) { - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; - - dequeue_entity(qcfs_rq, se, flags); - } else if (se->on_rq) - break; - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - update_load_avg(qcfs_rq, se, UPDATE_TG); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; } - /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) - dl_server_start(&rq->fair_server); - - /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, queued_delta); - -unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -6472,6 +6549,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6639,19 +6717,28 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void task_throttle_setup_work(struct task_struct *p) {} +static bool task_is_throttled(struct task_struct *p) { return false; } +static void dequeue_throttled_task(struct task_struct *p, int flags) {} +static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; } -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { return 0; } @@ -6831,6 +6918,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; + if (task_is_throttled(p) && enqueue_throttled_task(p)) + return; + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. @@ -6883,10 +6973,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; - flags = ENQUEUE_WAKEUP; } @@ -6908,10 +6994,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; } if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { @@ -6941,7 +7023,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) check_update_overutilized_status(rq); -enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -6963,6 +7044,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) bool was_sched_idle = sched_idle_rq(rq); bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; + bool task_throttled = flags & DEQUEUE_THROTTLE; struct task_struct *p = NULL; int h_nr_idle = 0; int h_nr_queued = 0; @@ -6996,9 +7078,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -7010,7 +7091,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + if (task_sleep && se) set_next_buddy(se); break; } @@ -7037,9 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } sub_nr_running(rq, h_nr_queued); @@ -7073,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { + if (task_is_throttled(p)) { + dequeue_throttled_task(p, flags); + return true; + } + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); @@ -8660,7 +8745,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + if (task_is_throttled(p)) return; if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { @@ -8741,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; struct cfs_rq *cfs_rq; + struct task_struct *p; + bool throttled; again: cfs_rq = &rq->cfs; if (!cfs_rq->nr_queued) return NULL; + throttled = false; + do { /* Might not have done put_prev_entity() */ if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; + throttled |= check_cfs_rq_runtime(cfs_rq); se = pick_next_entity(rq, cfs_rq); if (!se) @@ -8761,7 +8849,10 @@ again: cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + if (unlikely(throttled)) + task_throttle_setup_work(p); + return p; } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); @@ -8859,11 +8950,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru return pick_next_task_fair(rq, prev, NULL); } -static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) -{ - return !!dl_se->rq->cfs.nr_queued; -} - static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) { return pick_task_fair(dl_se->rq); @@ -8875,7 +8961,7 @@ void fair_server_init(struct rq *rq) init_dl_entity(dl_se); - dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + dl_server_init(dl_se, rq, fair_server_pick_task); } /* @@ -8928,8 +9014,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - /* throttled hierarchies are not runnable */ - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + /* !se->on_rq also covers throttled task */ + if (!se->on_rq) return false; /* Tell the scheduler that we'd really like se to run next. */ @@ -9288,7 +9374,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or - * 2) throttled_lb_pair, or + * 2) target cfs_rq is in throttled hierarchy, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or * 5) are cache-hot on their current CPU, or @@ -9297,7 +9383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (lb_throttled_hierarchy(p, env->dst_cpu)) return 0; /* @@ -13081,10 +13167,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - return; - - if (!throttled_hierarchy(cfs_rq)) + /* + * If a task gets attached to this cfs_rq and before being queued, + * it gets migrated to another CPU due to reasons like affinity + * change, make sure this cfs_rq stays on leaf cfs_rq list to have + * that removed load decayed or it can cause faireness problem. + */ + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ @@ -13095,10 +13184,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); - if (cfs_rq_throttled(cfs_rq)) - break; - - if (!throttled_hierarchy(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 62c3fa543c0f..f921302dc40f 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { u64 throttled; - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) throttled = U64_MAX; else throttled = cfs_rq->throttled_clock_pelt_time; @@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - if (unlikely(cfs_rq->throttle_count)) + if (unlikely(cfs_rq->pelt_clock_throttled)) return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c new file mode 100644 index 000000000000..a23747bbe25b --- /dev/null +++ b/kernel/sched/rq-offsets.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#define COMPILE_OFFSETS +#include <linux/kbuild.h> +#include <linux/types.h> +#include "sched.h" + +int main(void) +{ + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned)); + + return 0; +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..1f5d07067f60 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -365,25 +365,50 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * * dl_se::rq -- runqueue we belong to. * - * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the - * server when it runs out of tasks to run. - * * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this * returns NULL. * * dl_server_update() -- called from update_curr_common(), propagates runtime * to the server. * - * dl_server_start() - * dl_server_stop() -- start/stop the server when it has (no) tasks. + * dl_server_start() -- start the server when it has tasks; it will stop + * automatically when there are no more tasks, per + * dl_se::server_pick() returning NULL. + * + * dl_server_stop() -- (force) stop the server; use when updating + * parameters. * * dl_server_init() -- initializes the server. + * + * When started the dl_server will (per dl_defer) schedule a timer for its + * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a + * server will run at the very end of its period. + * + * This is done such that any runtime from the target class can be accounted + * against the server -- through dl_server_update() above -- such that when it + * becomes time to run, it might already be out of runtime and get deferred + * until the next period. In this case dl_server_timer() will alternate + * between defer and replenish but never actually enqueue the server. + * + * Only when the target class does not manage to exhaust the server's runtime + * (there's actualy starvation in the given period), will the dl_server get on + * the runqueue. Once queued it will pick tasks from the target class and run + * them until either its runtime is exhaused, at which point its back to + * dl_server_timer, or until there are no more tasks to run, at which point + * the dl_server stops itself. + * + * By stopping at this point the dl_server retains bandwidth, which, if a new + * task wakes up imminently (starting the server again), can be used -- + * subject to CBS wakeup rules -- without having to wait for the next period. + * + * Additionally, because of the dl_defer behaviour the start/stop behaviour is + * naturally thottled to once per period, avoiding high context switch + * workloads from spamming the hrtimer program/cancel paths. */ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); extern void sched_init_dl_servers(void); @@ -735,10 +760,12 @@ struct cfs_rq { u64 throttled_clock_pelt_time; u64 throttled_clock_self; u64 throttled_clock_self_time; - int throttled; + bool throttled:1; + bool pelt_clock_throttled:1; int throttle_count; struct list_head throttled_list; struct list_head throttled_csd_list; + struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1935,12 +1962,12 @@ extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); -extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +extern void init_numa_balancing(u64 clone_flags, struct task_struct *p); #else /* !CONFIG_NUMA_BALANCING: */ static inline void -init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +init_numa_balancing(u64 clone_flags, struct task_struct *p) { } @@ -2342,6 +2369,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SPECIAL 0x10 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_THROTTLE 0x800 #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2658,6 +2686,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_entity(struct sched_dl_entity *dl_se); +extern void init_cfs_throttle_work(struct task_struct *p); + #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6e2f54169e66..444bdfdab731 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd) enum numa_topology_type sched_numa_topology_type; static int sched_domains_numa_levels; -static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; @@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl, int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); + sd_weight = cpumask_weight(tl->mask(tl, cpu)); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); @@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl, }; sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); @@ -1732,22 +1724,63 @@ sd_init(struct sched_domain_topology_level *tl, return sd; } +#ifdef CONFIG_SCHED_SMT +int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; +} + +const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_CLUSTER +int cpu_cluster_flags(void) +{ + return SD_CLUSTER | SD_SHARE_LLC; +} + +const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_MC +int cpu_core_flags(void) +{ + return SD_SHARE_LLC; +} + +const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} +#endif + +const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + /* * Topology list, bottom-up. */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -1768,10 +1801,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) } #ifdef CONFIG_NUMA +static int cpu_numa_flags(void) +{ + return SD_NUMA; +} -static const struct cpumask *sd_numa_mask(int cpu) +static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; } static void sched_numa_warn(const char *str) @@ -2413,7 +2450,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) * breaks the linking done for an earlier span. */ for_each_cpu(cpu, cpu_map) { - const struct cpumask *tl_cpu_mask = tl->mask(cpu); + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); int id; /* lowest bit set in this mask is used as a unique id */ @@ -2421,7 +2458,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) if (cpumask_test_cpu(id, id_seen)) { /* First CPU has already been seen, ensure identical spans */ - if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) return false; } else { /* First CPU hasn't been seen before, ensure it's a completely new span */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738..25f62867a16d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -741,6 +741,26 @@ out: } #ifdef SECCOMP_ARCH_NATIVE +static bool seccomp_uprobe_exception(struct seccomp_data *sd) +{ +#if defined __NR_uretprobe || defined __NR_uprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + { +#ifdef __NR_uretprobe + if (sd->nr == __NR_uretprobe) + return true; +#endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif + return false; +} + /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs @@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe -#ifdef SECCOMP_ARCH_COMPAT - if (sd->arch == SECCOMP_ARCH_NATIVE) -#endif - if (sd->nr == __NR_uretprobe) - return true; -#endif + if (seccomp_uprobe_exception(sd)) + return true; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1043,6 +1058,9 @@ static const int mode1_syscalls[] = { #ifdef __NR_uretprobe __NR_uretprobe, #endif +#ifdef __NR_uprobe + __NR_uprobe, +#endif -1, /* negative terminated */ }; @@ -1139,7 +1157,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn static bool should_sleep_killable(struct seccomp_filter *match, struct seccomp_knotif *n) { - return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; + return match->wait_killable_recv && n->state >= SECCOMP_NOTIFY_SENT; } static int seccomp_do_user_notification(int this_syscall, @@ -1186,13 +1204,11 @@ static int seccomp_do_user_notification(int this_syscall, if (err != 0) { /* - * Check to see if the notifcation got picked up and - * whether we should switch to wait killable. + * Check to see whether we should switch to wait + * killable. Only return the interrupted error if not. */ - if (!wait_killable && should_sleep_killable(match, &n)) - continue; - - goto interrupted; + if (!(!wait_killable && should_sleep_killable(match, &n))) + goto interrupted; } addfd = list_first_entry_or_null(&n.addfd, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c..bf5d05c635ff 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c..e8c479329282 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -787,10 +787,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -2295,11 +2295,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3..5b6997f4dc3d 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> +#include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> @@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -130,7 +129,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) @@ -253,16 +252,13 @@ out: void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); - kfree(ns); -} - -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) @@ -466,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -476,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -484,9 +478,15 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns.ns_type = ns_common_type(&init_time_ns), + .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, + .ns.inum = ns_init_inum(&init_time_ns), .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 2a42c1036ea8..484ad7a18463 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -815,6 +815,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe unsigned long bitmap; unsigned long ret; int offset; + int bit; int i; ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset); @@ -829,6 +830,15 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe if (fregs) ftrace_regs_set_instruction_pointer(fregs, ret); + bit = ftrace_test_recursion_trylock(trace.func, ret); + /* + * This can fail because ftrace_test_recursion_trylock() allows one nest + * call. If we are already in a nested call, then we don't probe this and + * just return the original return address. + */ + if (unlikely(bit < 0)) + goto out; + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL trace.retval = ftrace_regs_get_return_value(fregs); #endif @@ -852,6 +862,8 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } } + ftrace_test_recursion_unlock(bit); +out: /* * The ftrace_graph_return() may still access the current * ret_stack structure, we need to make sure the update of @@ -1397,7 +1409,8 @@ error: ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); - unregister_pm_notifier(&ftrace_suspend_notifier); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index c8034dfc1070..5a807d62e76d 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -428,8 +428,9 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad { unsigned long *addrs; - if (alist->index >= alist->size) - return -ENOMEM; + /* Previously we failed to expand the list. */ + if (alist->index == alist->size) + return -ENOSPC; alist->addrs[alist->index++] = addr; if (alist->index < alist->size) @@ -489,7 +490,7 @@ static int fprobe_module_callback(struct notifier_block *nb, for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); - if (alist.index < alist.size && alist.index > 0) + if (alist.index > 0) ftrace_set_filter_ips(&fprobe_graph_ops.ops, alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c index eea447b06907..c1347da69e9d 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.c +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) mon = ltl_get_monitor(current); switch (id) { +#ifdef __NR_clock_nanosleep case __NR_clock_nanosleep: +#endif #ifdef __NR_clock_nanosleep_time64 case __NR_clock_nanosleep_time64: #endif @@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); break; +#ifdef __NR_futex case __NR_futex: +#endif #ifdef __NR_futex_time64 case __NR_futex_time64: #endif diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 1482e91c39f4..48338520376f 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor *mon = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); (*pos)++; @@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) retval = create_monitor_dir(monitor, parent); if (retval) - return retval; + goto out_unlock; /* keep children close to the parent for easier visualisation */ if (parent) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b7db732c0b1..b3c94fbaf002 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -834,7 +834,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { - trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } @@ -871,6 +874,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_clear(&parser); ret = 0; } + out: trace_parser_put(&parser); if (ret < 0) { @@ -7209,7 +7213,7 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user entry = ring_buffer_event_data(event); entry->ip = ip; - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + len = copy_from_user_nofault(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; @@ -7306,7 +7310,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, entry = ring_buffer_event_data(event); - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + len = copy_from_user_nofault(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5d64a18cacac..d06854bd32b3 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -230,6 +230,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = tracing_check_open_get_tr(NULL); if (ret) return ret; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index af42aaa3d172..2ab283fd3032 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm, { struct user_event_enabler_fault *fault; - fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT); if (!fault) return false; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccae62d4fb91..fa60362a3f31 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], return -EINVAL; } buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index fd259da0aa64..dc734867f0fc 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2322,12 +2322,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, int running, err; char *buf __free(kfree) = NULL; - buf = kmalloc(count, GFP_KERNEL); + if (count < 1) + return 0; + + buf = kmalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, ubuf, count)) return -EFAULT; + buf[count] = '\0'; if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) return -ENOMEM; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 16b283f9d831..6ea2f6363b90 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -57,12 +57,11 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); + stats->ac_ppid = task_ppid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); - stats->ac_ppid = pid_alive(tsk) ? - task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); task_cputime(tsk, &utime, &stime); diff --git a/kernel/user.c b/kernel/user.c index f46b1d41163b..0163665914c9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,10 +65,11 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.count = REFCOUNT_INIT(3), + .ns.ns_type = ns_common_type(&init_user_ns), + .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = PROC_USER_INIT_INO, + .ns.inum = ns_init_inum(&init_user_ns), #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 682f40d5632d..03cb63883d04 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/nstree.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); @@ -124,12 +125,11 @@ int create_user_ns(struct cred *new) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_alloc_inum(&ns->ns); + + ret = ns_common_init(ns); if (ret) goto fail_free; - ns->ns.ops = &userns_operations; - refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; @@ -159,12 +159,13 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); @@ -218,11 +220,12 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); - kmem_cache_free(user_ns_cachep, ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; - } while (refcount_dec_and_test(&parent->ns.count)); + } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) @@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; @@ -1402,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, @@ -1413,6 +1410,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); + ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init); diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f2..ebbfc578a9d3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -13,6 +13,7 @@ #include <linux/cred.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> +#include <linux/nstree.h> #include <linux/sched/task.h> static struct kmem_cache *uts_ns_cache __ro_after_init; @@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); } -static struct uts_namespace *create_uts_ns(void) -{ - struct uts_namespace *uts_ns; - - uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); - if (uts_ns) - refcount_set(&uts_ns->ns.count, 1); - return uts_ns; -} - /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone @@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = create_uts_ns(); + ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL); if (!ns) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free; ns->ucounts = ucounts; - ns->ns.ops = &utsns_operations; - down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); + ns_tree_add(ns); return ns; fail_free: @@ -86,7 +76,7 @@ fail: * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; @@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags, void free_uts_ns(struct uts_namespace *ns) { + ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kmem_cache_free(uts_ns_cache, ns); -} - -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *utsns_get(struct task_struct *task) @@ -159,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns) const struct proc_ns_operations utsns_operations = { .name = "uts", - .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, @@ -174,4 +160,5 @@ void __init uts_ns_init(void) offsetof(struct uts_namespace, name), sizeof_field(struct uts_namespace, name), NULL); + ns_tree_add(&init_uts_ns); } diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index bc738fa90c1d..27107dcc1cbf 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -100,6 +100,7 @@ void vhost_task_stop(struct vhost_task *vtsk) * freeing it below. */ wait_for_completion(&vtsk->exited); + put_task_struct(vtsk->task); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); @@ -148,7 +149,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), return ERR_CAST(tsk); } - vtsk->task = tsk; + vtsk->task = get_task_struct(tsk); return vtsk; } EXPORT_SYMBOL_GPL(vhost_task_create); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c6b79b3675c3..45320e27a16c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -222,7 +222,9 @@ struct worker_pool { struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ - +#ifdef CONFIG_PREEMPT_RT + spinlock_t cb_lock; /* BH worker cancel lock */ +#endif /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). @@ -2930,7 +2932,7 @@ static void idle_worker_timeout(struct timer_list *t) raw_spin_unlock_irq(&pool->lock); if (do_cull) - queue_work(system_unbound_wq, &pool->idle_cull_work); + queue_work(system_dfl_wq, &pool->idle_cull_work); } /** @@ -3078,6 +3080,31 @@ restart: goto restart; } +#ifdef CONFIG_PREEMPT_RT +static void worker_lock_callback(struct worker_pool *pool) +{ + spin_lock(&pool->cb_lock); +} + +static void worker_unlock_callback(struct worker_pool *pool) +{ + spin_unlock(&pool->cb_lock); +} + +static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) +{ + spin_lock(&pool->cb_lock); + spin_unlock(&pool->cb_lock); +} + +#else + +static void worker_lock_callback(struct worker_pool *pool) { } +static void worker_unlock_callback(struct worker_pool *pool) { } +static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { } + +#endif + /** * manage_workers - manage worker pool * @worker: self @@ -3557,6 +3584,7 @@ static void bh_worker(struct worker *worker) int nr_restarts = BH_WORKER_RESTARTS; unsigned long end = jiffies + BH_WORKER_JIFFIES; + worker_lock_callback(pool); raw_spin_lock_irq(&pool->lock); worker_leave_idle(worker); @@ -3585,6 +3613,7 @@ done: worker_enter_idle(worker); kick_pool(pool); raw_spin_unlock_irq(&pool->lock); + worker_unlock_callback(pool); } /* @@ -4222,17 +4251,17 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) (data & WORK_OFFQ_BH)) { /* * On RT, prevent a live lock when %current preempted - * soft interrupt processing or prevents ksoftirqd from - * running by keeping flipping BH. If the BH work item - * runs on a different CPU then this has no effect other - * than doing the BH disable/enable dance for nothing. - * This is copied from - * kernel/softirq.c::tasklet_unlock_spin_wait(). + * soft interrupt processing by blocking on lock which + * is owned by the thread invoking the callback. */ while (!try_wait_for_completion(&barr.done)) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - local_bh_disable(); - local_bh_enable(); + struct worker_pool *pool; + + guard(rcu)(); + pool = get_work_pool(work); + if (pool) + workqueue_callback_cancel_wait_running(pool); } else { cpu_relax(); } @@ -4782,6 +4811,9 @@ static int init_worker_pool(struct worker_pool *pool) ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; +#ifdef CONFIG_PREEMPT_RT + spin_lock_init(&pool->cb_lock); +#endif /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(); @@ -6046,7 +6078,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock(); preempt_disable(); if (cpu == WORK_CPU_UNBOUND) @@ -6056,7 +6087,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) ret = !list_empty(&pwq->inactive_works); preempt_enable(); - rcu_read_unlock(); return ret; } @@ -7546,8 +7576,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (!thresh) return; - rcu_read_lock(); - for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; @@ -7589,8 +7617,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) } - rcu_read_unlock(); - if (lockup_detected) show_all_workqueues(); @@ -7642,7 +7668,7 @@ static int wq_watchdog_param_set_thresh(const char *val, if (ret) return ret; - if (system_wq) + if (system_percpu_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; @@ -7802,22 +7828,22 @@ void __init workqueue_init_early(void) ordered_wq_attrs[i] = attrs; } - system_wq = alloc_workqueue("events", 0, 0); - system_percpu_wq = alloc_workqueue("events", 0, 0); - system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); - system_long_wq = alloc_workqueue("events_long", 0, 0); + system_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_highpri_wq = alloc_workqueue("events_highpri", + WQ_HIGHPRI | WQ_PERCPU, 0); + system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", - WQ_FREEZABLE, 0); + WQ_FREEZABLE | WQ_PERCPU, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", - WQ_POWER_EFFICIENT, 0); + WQ_POWER_EFFICIENT | WQ_PERCPU, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", - WQ_FREEZABLE | WQ_POWER_EFFICIENT, - 0); - system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); + WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0); + system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", - WQ_BH | WQ_HIGHPRI, 0); + WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || |