diff options
Diffstat (limited to 'kernel')
163 files changed, 10097 insertions, 4604 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 422270d64820..54e581072617 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -109,6 +109,15 @@ config KEXEC_HANDOVER to keep data or state alive across the kexec. For this to work, both source and target kernels need to have this option enabled. +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index df3dd8291bb6..9fe722305c9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/acct.c b/kernel/acct.c index 61630110e29d..2a2b3c874acd 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -520,26 +520,23 @@ static void fill_ac(struct bsd_acct_struct *acct) static void acct_write_process(struct bsd_acct_struct *acct) { struct file *file = acct->file; - const struct cred *cred; acct_t *ac = &acct->ac; /* Perform file operations on behalf of whoever enabled accounting */ - cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. Then get freeze protection. If - * the fs is frozen, just skip the write as we could deadlock - * the system otherwise. - */ - if (check_free_space(acct) && file_start_write_trylock(file)) { - /* it's been opened O_APPEND, so position is irrelevant */ - loff_t pos = 0; - __kernel_write(file, ac, sizeof(acct_t), &pos); - file_end_write(file); + scoped_with_creds(file->f_cred) { + /* + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. + */ + if (check_free_space(acct) && file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, ac, sizeof(acct_t), &pos); + file_end_write(file); + } } - - revert_creds(cred); } static void do_acct_process(struct bsd_acct_struct *acct) diff --git a/kernel/audit.h b/kernel/audit.h index 0f05933a173b..7c401729e21b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -138,7 +138,7 @@ struct audit_context { struct audit_aux_data *aux_pids; struct sockaddr_storage *sockaddr; size_t sockaddr_len; - /* Save things to print about task_struct */ + /* Save things to print about task_struct */ pid_t ppid; kuid_t uid, euid, suid, fsuid; kgid_t gid, egid, sgid, fsgid; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index c401082d9b25..6a86c0683b67 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -638,10 +638,9 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) void *bufp; int i; - data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); + data = kzalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); if (unlikely(!data)) return NULL; - memset(data, 0, sizeof(*data)); data->flags = krule->flags | krule->listnr; data->action = krule->action; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1966144bdfe..dd0563a8e0be 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2416,41 +2416,36 @@ void __audit_inode_child(struct inode *parent, if (inode) handle_one(inode); - /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name || - (n->type != AUDIT_TYPE_PARENT && - n->type != AUDIT_TYPE_UNKNOWN)) + /* can only match entries that have a name */ + if (!n->name) continue; - if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && - !audit_compare_dname_path(dname, - n->name->name, n->name_len)) { - if (n->type == AUDIT_TYPE_UNKNOWN) - n->type = AUDIT_TYPE_PARENT; + /* look for a parent entry first */ + if (!found_parent && + (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) && + (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && + !audit_compare_dname_path(dname, n->name->name, n->name_len))) { + n->type = AUDIT_TYPE_PARENT; found_parent = n; - break; - } - } - - cond_resched(); - - /* is there a matching child entry? */ - list_for_each_entry(n, &context->names_list, list) { - /* can only match entries that have a name */ - if (!n->name || - (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) + if (found_child) + break; continue; + } - if (!strcmp(dname->name, n->name->name) || - !audit_compare_dname_path(dname, n->name->name, + /* is there a matching child entry? */ + if (!found_child && + (n->type == type || n->type == AUDIT_TYPE_UNKNOWN) && + (!strcmp(dname->name, n->name->name) || + !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : - AUDIT_NAME_FULL)) { + AUDIT_NAME_FULL))) { if (n->type == AUDIT_TYPE_UNKNOWN) n->type = type; found_child = n; - break; + if (found_parent) + break; } } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7fd0badfacb1..232cbc97434d 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 80b1765a3159..1eeb31c5b317 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -335,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) } /* Called from syscall */ -static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { - struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; - if (index >= array->map.max_entries) { + if (index >= map->max_entries) { *next = 0; return 0; } - if (index == array->map.max_entries - 1) + if (index == map->max_entries - 1) return -ENOENT; *next = index + 1; @@ -448,19 +447,12 @@ static void array_map_free_internal_structs(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer and workqueue - * on uref dropping to zero. - */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { - for (i = 0; i < array->map.max_entries; i++) { - if (btf_record_has_field(map->record, BPF_TIMER)) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_TASK_WORK)) - bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); - } - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -796,7 +788,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, @@ -822,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, @@ -1211,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, @@ -1315,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, @@ -1351,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, @@ -1436,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c new file mode 100644 index 000000000000..c96630cb75bf --- /dev/null +++ b/kernel/bpf/bpf_insn_array.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Isovalent */ + +#include <linux/bpf.h> + +struct bpf_insn_array { + struct bpf_map map; + atomic_t used; + long *ips; + DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values); +}; + +#define cast_insn_array(MAP_PTR) \ + container_of((MAP_PTR), struct bpf_insn_array, map) + +#define INSN_DELETED ((u32)-1) + +static inline u64 insn_array_alloc_size(u32 max_entries) +{ + const u64 base_size = sizeof(struct bpf_insn_array); + const u64 entry_size = sizeof(struct bpf_insn_array_value); + + return base_size + max_entries * (entry_size + sizeof(long)); +} + +static int insn_array_alloc_check(union bpf_attr *attr) +{ + u32 value_size = sizeof(struct bpf_insn_array_value); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != value_size || attr->map_flags != 0) + return -EINVAL; + + return 0; +} + +static void insn_array_free(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + bpf_map_area_free(insn_array); +} + +static struct bpf_map *insn_array_alloc(union bpf_attr *attr) +{ + u64 size = insn_array_alloc_size(attr->max_entries); + struct bpf_insn_array *insn_array; + + insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE); + if (!insn_array) + return ERR_PTR(-ENOMEM); + + /* ips are allocated right after the insn_array->values[] array */ + insn_array->ips = (void *)&insn_array->values[attr->max_entries]; + + bpf_map_init_from_attr(&insn_array->map, attr); + + /* BPF programs aren't allowed to write to the map */ + insn_array->map.map_flags |= BPF_F_RDONLY_PROG; + + return &insn_array->map; +} + +static void *insn_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= insn_array->map.max_entries)) + return NULL; + + return &insn_array->values[index]; +} + +static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + struct bpf_insn_array_value val = {}; + + if (unlikely(index >= insn_array->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags & BPF_NOEXIST)) + return -EEXIST; + + copy_map_value(map, &val, value); + if (val.jitted_off || val.xlated_off) + return -EINVAL; + + insn_array->values[index].orig_off = val.orig_off; + + return 0; +} + +static long insn_array_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +static int insn_array_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + if (!btf_type_is_i32(key_type)) + return -EINVAL; + + if (!btf_type_is_i64(value_type)) + return -EINVAL; + + return 0; +} + +static u64 insn_array_mem_usage(const struct bpf_map *map) +{ + return insn_array_alloc_size(map->max_entries); +} + +static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + if ((off % sizeof(long)) != 0 || + (off / sizeof(long)) >= map->max_entries) + return -EINVAL; + + /* from BPF's point of view, this map is a jump table */ + *imm = (unsigned long)insn_array->ips + off; + + return 0; +} + +BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array) + +const struct bpf_map_ops insn_array_map_ops = { + .map_alloc_check = insn_array_alloc_check, + .map_alloc = insn_array_alloc, + .map_free = insn_array_free, + .map_get_next_key = bpf_array_get_next_key, + .map_lookup_elem = insn_array_lookup_elem, + .map_update_elem = insn_array_update_elem, + .map_delete_elem = insn_array_delete_elem, + .map_check_btf = insn_array_check_btf, + .map_mem_usage = insn_array_mem_usage, + .map_direct_value_addr = insn_array_map_direct_value_addr, + .map_btf_id = &insn_array_btf_ids[0], +}; + +static inline bool is_frozen(struct bpf_map *map) +{ + guard(mutex)(&map->freeze_mutex); + + return map->frozen; +} + +static bool is_insn_array(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_INSN_ARRAY; +} + +static inline bool valid_offsets(const struct bpf_insn_array *insn_array, + const struct bpf_prog *prog) +{ + u32 off; + int i; + + for (i = 0; i < insn_array->map.max_entries; i++) { + off = insn_array->values[i].orig_off; + + if (off >= prog->len) + return false; + + if (off > 0) { + if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM)) + return false; + } + } + + return true; +} + +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + struct bpf_insn_array_value *values = insn_array->values; + int i; + + if (!is_frozen(map)) + return -EINVAL; + + if (!valid_offsets(insn_array, prog)) + return -EINVAL; + + /* + * There can be only one program using the map + */ + if (atomic_xchg(&insn_array->used, 1)) + return -EBUSY; + + /* + * Reset all the map indexes to the original values. This is needed, + * e.g., when a replay of verification with different log level should + * be performed. + */ + for (i = 0; i < map->max_entries; i++) + values[i].xlated_off = values[i].orig_off; + + return 0; +} + +int bpf_insn_array_ready(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (!insn_array->ips[i]) + return -EFAULT; + } + + return 0; +} + +void bpf_insn_array_release(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + atomic_set(&insn_array->used, 0); +} + +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + if (len <= 1) + return; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off <= off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + insn_array->values[i].xlated_off += len - 1; + } +} + +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off < off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (insn_array->values[i].xlated_off < off + len) + insn_array->values[i].xlated_off = INSN_DELETED; + else + insn_array->values[i].xlated_off -= len; + } +} + +/* + * This function is called by JITs. The image is the real program + * image, the offsets array set up the xlated -> jitted mapping. + * The offsets[xlated] offset should point to the beginning of + * the jitted instruction. + */ +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ + struct bpf_insn_array *insn_array; + struct bpf_map *map; + u32 xlated_off; + int i, j; + + if (!offsets || !image) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (!is_insn_array(map)) + continue; + + insn_array = cast_insn_array(map); + for (j = 0; j < map->max_entries; j++) { + xlated_off = insn_array->values[j].xlated_off; + if (xlated_off == INSN_DELETED) + continue; + if (xlated_off < prog->aux->subprog_start) + continue; + xlated_off -= prog->aux->subprog_start; + if (xlated_off >= prog->len) + continue; + + insn_array->values[j].jitted_off = offsets[xlated_off]; + insn_array->ips[j] = (long)(image + offsets[xlated_off]); + } + } +} diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 6ac35430c573..eec60b57bd3d 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -634,37 +634,24 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { struct bpf_iter_link *iter_link; - struct file *file; unsigned int flags; - int err, fd; + int err; if (link->ops != &bpf_iter_link_lops) return -EINVAL; flags = O_RDONLY | O_CLOEXEC; - fd = get_unused_fd_flags(flags); - if (fd < 0) - return fd; - - file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto free_fd; - } + + FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags)); + if (fdf.err) + return fdf.err; iter_link = container_of(link, struct bpf_iter_link, link); - err = prepare_seq_file(file, iter_link); + err = prepare_seq_file(fd_prepare_file(fdf), iter_link); if (err) - goto free_file; + return err; /* Automatic cleanup handles fput */ - fd_install(fd, file); - return fd; - -free_file: - fput(file); -free_fd: - put_unused_fd(fd); - return err; + return fd_publish(fdf); } struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b931fbceb54d..e2fe6c32822b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,30 +73,24 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) + void *value, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; - if (charge_mem && mem_charge(smap, owner, smap->elem_size)) + if (mem_charge(smap, owner, smap->elem_size)) return NULL; - if (smap->bpf_ma) { - selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); - if (selem) - /* Keep the original bpf_map_kzalloc behavior - * before started using the bpf_mem_cache_alloc. - * - * No need to use zero_map_value. The bpf_selem_free() - * only does bpf_mem_cache_free when there is - * no other bpf prog is using the selem. - */ - memset(SDATA(selem)->data, 0, smap->map.value_size); + if (smap->use_kmalloc_nolock) { + selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size, + __GFP_ZERO, NUMA_NO_NODE); } else { selem = bpf_map_kzalloc(&smap->map, smap->elem_size, gfp_flags | __GFP_NOWARN); } if (selem) { + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + if (value) { /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); @@ -106,13 +100,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return selem; } - if (charge_mem) - mem_uncharge(smap, owner, smap->elem_size); + mem_uncharge(smap, owner, smap->elem_size); return NULL; } -/* rcu tasks trace callback for bpf_ma == false */ +/* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; @@ -127,12 +120,23 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } +/* Handle use_kmalloc_nolock == false */ +static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, + bool vanilla_rcu) +{ + if (vanilla_rcu) + kfree_rcu(local_storage, rcu); + else + call_rcu_tasks_trace(&local_storage->rcu, + __bpf_local_storage_free_trace_rcu); +} + static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; local_storage = container_of(rcu, struct bpf_local_storage, rcu); - bpf_mem_cache_raw_free(local_storage); + kfree_nolock(local_storage); } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) @@ -143,46 +147,27 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) call_rcu(rcu, bpf_local_storage_free_rcu); } -/* Handle bpf_ma == false */ -static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, - bool vanilla_rcu) -{ - if (vanilla_rcu) - kfree_rcu(local_storage, rcu); - else - call_rcu_tasks_trace(&local_storage->rcu, - __bpf_local_storage_free_trace_rcu); -} - static void bpf_local_storage_free(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *smap, - bool bpf_ma, bool reuse_now) + bool reuse_now) { if (!local_storage) return; - if (!bpf_ma) { + if (!local_storage->use_kmalloc_nolock) { __bpf_local_storage_free(local_storage, reuse_now); return; } - if (!reuse_now) { - call_rcu_tasks_trace(&local_storage->rcu, - bpf_local_storage_free_trace_rcu); + if (reuse_now) { + call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); return; } - if (smap) - bpf_mem_cache_free(&smap->storage_ma, local_storage); - else - /* smap could be NULL if the selem that triggered - * this 'local_storage' creation had been long gone. - * In this case, directly do call_rcu(). - */ - call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_trace_rcu); } -/* rcu tasks trace callback for bpf_ma == false */ +/* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; @@ -194,7 +179,7 @@ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(selem, rcu); } -/* Handle bpf_ma == false */ +/* Handle use_kmalloc_nolock == false */ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { @@ -216,7 +201,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) migrate_disable(); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); migrate_enable(); - bpf_mem_cache_raw_free(selem); + kfree_nolock(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) @@ -228,14 +213,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) } void bpf_selem_free(struct bpf_local_storage_elem *selem, - struct bpf_local_storage_map *smap, bool reuse_now) { - if (!smap->bpf_ma) { - /* Only task storage has uptrs and task storage - * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true - * for task storage, so this bpf_obj_free_fields() won't unpin - * any uptr. + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + if (!smap->use_kmalloc_nolock) { + /* + * No uptr will be unpin even when reuse_now == false since uptr + * is only supported in task local storage, where + * smap->use_kmalloc_nolock == true. */ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); @@ -243,18 +231,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, } if (reuse_now) { - /* reuse_now == true only happens when the storage owner - * (e.g. task_struct) is being destructed or the map itself - * is being destructed (ie map_free). In both cases, - * no bpf prog can have a hold on the selem. It is - * safe to unpin the uptrs and free the selem now. - */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - /* Instead of using the vanilla call_rcu(), - * bpf_mem_cache_free will be able to reuse selem - * immediately. + /* + * While it is okay to call bpf_obj_free_fields() that unpins uptr when + * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity. */ - bpf_mem_cache_free(&smap->selem_ma, selem); + call_rcu(&selem->rcu, bpf_selem_free_rcu); return; } @@ -264,7 +245,6 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) { struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map *smap; struct hlist_node *n; /* The "_safe" iteration is needed. @@ -272,10 +252,8 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) * but bpf_selem_free will use the selem->rcu_head * which is union-ized with the selem->free_node. */ - hlist_for_each_entry_safe(selem, n, list, free_node) { - smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - bpf_selem_free(selem, smap, reuse_now); - } + hlist_for_each_entry_safe(selem, n, list, free_node) + bpf_selem_free(selem, reuse_now); } /* local_storage->lock must be held and selem->local_storage == local_storage. @@ -284,7 +262,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, struct hlist_head *free_selem_list) + struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -297,8 +275,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor * The owner may be freed once the last selem is unlinked * from local_storage. */ - if (uncharge_mem) - mem_uncharge(smap, owner, smap->elem_size); + mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); @@ -336,47 +313,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *storage_smap, - struct bpf_local_storage_elem *selem) -{ - - struct bpf_local_storage_map *selem_smap; - - /* local_storage->smap may be NULL. If it is, get the bpf_ma - * from any selem in the local_storage->list. The bpf_ma of all - * local_storage and selem should have the same value - * for the same map type. - * - * If the local_storage->list is already empty, the caller will not - * care about the bpf_ma value also because the caller is not - * responsible to free the local_storage. - */ - - if (storage_smap) - return storage_smap->bpf_ma; - - if (!selem) { - struct hlist_node *n; - - n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), - bpf_rcu_lock_held()); - if (!n) - return false; - - selem = hlist_entry(n, struct bpf_local_storage_elem, snode); - } - selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - - return selem_smap->bpf_ma; -} - static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool reuse_now) { - struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; - bool bpf_ma, free_local_storage = false; + bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; @@ -386,20 +327,17 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - storage_smap = rcu_dereference_check(local_storage->smap, - bpf_rcu_lock_held()); - bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, &selem_free_list); + local_storage, selem, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); if (free_local_storage) - bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); + bpf_local_storage_free(local_storage, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -434,7 +372,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, unsigned long flags; raw_spin_lock_irqsave(&b->lock, flags); - RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); } @@ -493,8 +430,9 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - if (smap->bpf_ma) - storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); + if (smap->use_kmalloc_nolock) + storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage), + __GFP_ZERO, NUMA_NO_NODE); else storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), gfp_flags | __GFP_NOWARN); @@ -507,6 +445,7 @@ int bpf_local_storage_alloc(void *owner, INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; + storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); @@ -528,22 +467,12 @@ int bpf_local_storage_alloc(void *owner, bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; - - /* Note that even first_selem was linked to smap's - * bucket->list, first_selem can be freed immediately - * (instead of kfree_rcu) because - * bpf_local_storage_map_free() does a - * synchronize_rcu_mult (waiting for both sleepable and - * normal programs) before walking the bucket->list. - * Hence, no one is accessing selem from the - * bucket->list under rcu_read_lock(). - */ } return 0; uncharge: - bpf_local_storage_free(storage, smap, smap->bpf_ma, true); + bpf_local_storage_free(storage, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -582,13 +511,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { - bpf_selem_free(selem, smap, true); + bpf_selem_free(selem, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } @@ -616,7 +545,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); @@ -656,7 +585,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - true, &old_selem_free_list); + &old_selem_free_list); } unlock: @@ -664,7 +593,7 @@ unlock: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(alloc_selem, smap, true); + bpf_selem_free(alloc_selem, true); } return err ? ERR_PTR(err) : SDATA(selem); } @@ -730,16 +659,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { - struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; - bool bpf_ma, free_storage = false; + bool free_storage = false; HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; - storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); - bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); - /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added to or deleted from the @@ -762,14 +687,14 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, &free_selem_list); + local_storage, selem, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); if (free_storage) - bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); + bpf_local_storage_free(local_storage, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -782,20 +707,10 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) return usage; } -/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. - * A deadlock free allocator is useful for storage that the bpf prog can easily - * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. - * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses - * memory immediately. To be reuse-immediate safe, the owner destruction - * code path needs to go through a rcu grace period before calling - * bpf_local_storage_destroy(). - * - * When bpf_ma == false, the kmalloc and kfree are used. - */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, - bool bpf_ma) + bool use_kmalloc_nolock) { struct bpf_local_storage_map *smap; unsigned int i; @@ -829,20 +744,9 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non * preemptible context. Thus, enforce all storages to use - * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled. + * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled. */ - smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma; - if (smap->bpf_ma) { - err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); - if (err) - goto free_smap; - - err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); - if (err) { - bpf_mem_alloc_destroy(&smap->selem_ma); - goto free_smap; - } - } + smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock; smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; @@ -912,12 +816,9 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); - if (smap->bpf_ma) { + if (smap->use_kmalloc_nolock) { rcu_barrier_tasks_trace(); - if (!rcu_trace_implies_rcu_gp()) - rcu_barrier(); - bpf_mem_alloc_destroy(&smap->selem_ma); - bpf_mem_alloc_destroy(&smap->storage_ma); + rcu_barrier(); } kvfree(smap->buckets); bpf_map_area_free(smap); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0a59df1c550a..7cb6e8d4282c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,6 +51,7 @@ BTF_ID(func, bpf_lsm_key_getsecurity) BTF_ID(func, bpf_lsm_audit_rule_match) #endif BTF_ID(func, bpf_lsm_ismaclabel) +BTF_ID(func, bpf_lsm_file_alloc_security) BTF_SET_END(bpf_lsm_disabled_hooks) /* List of LSM hooks that should operate on 'current' cgroup regardless diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index a41e6730edcf..278490683d28 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1162,6 +1162,7 @@ bool bpf_struct_ops_get(const void *kdata) map = __bpf_map_inc_not_zero(&st_map->map, false); return !IS_ERR(map); } +EXPORT_SYMBOL_GPL(bpf_struct_ops_get); void bpf_struct_ops_put(const void *kdata) { @@ -1173,6 +1174,7 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +EXPORT_SYMBOL_GPL(bpf_struct_ops_put); u32 bpf_struct_ops_id(const void *kdata) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 248f517d66d0..69988af44b37 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1665,7 +1665,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * returned value != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, @@ -1676,7 +1676,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, .uaddr = uaddr, .t_ctx = t_ctx, }; - struct sockaddr_storage unspec; + struct sockaddr_storage storage; struct cgroup *cgrp; int ret; @@ -1688,8 +1688,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, return 0; if (!ctx.uaddr) { - memset(&unspec, 0, sizeof(unspec)); - ctx.uaddr = (struct sockaddr *)&unspec; + memset(&storage, 0, sizeof(storage)); + ctx.uaddr = (struct sockaddr_unsized *)&storage; ctx.uaddrlen = 0; } else { ctx.uaddrlen = *uaddrlen; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d595fe512498..c8ae6ab31651 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1450,6 +1450,23 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) bpf_prog_clone_free(fp_other); } +static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct bpf_map *map; + int i; + + if (len <= 1) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + bpf_insn_array_adjust(map, off, len); + } +#endif +} + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; @@ -1505,6 +1522,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) clone = tmp; insn_delta = rewritten - 1; + /* Instructions arrays must be updated using absolute xlated offsets */ + adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten); + /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; @@ -1688,6 +1708,7 @@ bool bpf_opcode_in_insntable(u8 code) [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, + [BPF_JMP | BPF_JA | BPF_X] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL @@ -3129,8 +3150,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } -int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *addr1, void *addr2) +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { return -ENOTSUPP; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 20883c6b1546..f8a3c7eb451e 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -358,6 +358,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) { + verbose(cbs->private_data, "(%02x) gotox r%d\n", + insn->code, insn->dst_reg); } else if (insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO) { verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c2fcd0cd51e5..c8a9b27f8663 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -215,19 +215,6 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } -static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem) -{ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) - bpf_obj_free_task_work(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); -} - static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -240,7 +227,8 @@ static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - htab_free_internal_structs(htab, elem); + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -669,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -947,15 +934,21 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { + void *ptr; + if (!onallcpus) { /* copy true value_size bytes */ - copy_map_value(&htab->map, this_cpu_ptr(pptr), value); + ptr = this_cpu_ptr(pptr); + copy_map_value(&htab->map, ptr, value); + bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8); int off = 0, cpu; for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value_long(&htab->map, ptr, value + off); + bpf_obj_free_fields(htab->map.record, ptr); off += size; } } @@ -1098,8 +1091,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1206,8 +1198,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1275,8 +1266,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1338,8 +1328,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1416,8 +1405,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1452,8 +1440,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1509,8 +1496,9 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We only free timer on uref dropping to zero */ - htab_free_internal_structs(htab, l); + /* We only free internal structs on uref dropping to zero */ + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1521,13 +1509,14 @@ static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_internal_structs(htab); - else - htab_free_prealloced_internal_structs(htab); - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + if (htab_is_prealloc(htab)) + htab_free_prealloced_internal_structs(htab); + else + htab_free_malloced_internal_structs(htab); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c9fab9a356df..db72b96f9c8c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -28,6 +28,7 @@ #include <linux/verification.h> #include <linux/task_work.h> #include <linux/irq_work.h> +#include <linux/buildid.h> #include "../../lib/kstrtox.h" @@ -42,8 +43,7 @@ */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -59,8 +59,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -77,8 +76,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_delete_elem(map, key); } @@ -134,8 +132,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } @@ -777,9 +774,11 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; + preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); + preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); @@ -792,6 +791,7 @@ void bpf_put_buffers(void) if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); + preempt_enable(); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) @@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work) rcu_read_unlock_trace(); } +static void bpf_async_cb_rcu_free(struct rcu_head *rcu) +{ + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + + kfree_nolock(cb); +} + static void bpf_wq_delete_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, delete_work); cancel_work_sync(&w->work); - kfree_rcu(w, cb.rcu); + call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); } static void bpf_timer_delete_work(struct work_struct *work) @@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work) /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call - * kfree_rcu(t) right after for both preallocated and non-preallocated + * call_rcu() right after for both preallocated and non-preallocated * maps. The async->cb = NULL was already done and no code path can see * address 't' anymore. Timer if armed for existing bpf_hrtimer before * bpf_timer_cancel_and_free will have been cancelled. */ hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); } static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, @@ -1270,11 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until - * kmalloc_nolock() is available, avoid locking issues by using - * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). - */ - cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); + cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -1315,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u * or pinned in bpffs. */ WRITE_ONCE(async->cb, NULL); - kfree(cb); + kfree_nolock(cb); ret = -EPERM; } out: @@ -1580,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val) * timer _before_ calling us, such that failing to cancel it here will * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. * Therefore, we _need_ to cancel any outstanding timers before we do - * kfree_rcu, even though no more timers can be armed. + * call_rcu, even though no more timers can be armed. * * Moreover, we need to schedule work even if timer does not belong to * the calling callback_fn, as on two different CPUs, we can end up in a @@ -1607,7 +1610,7 @@ void bpf_timer_cancel_and_free(void *val) * completion. */ if (hrtimer_try_to_cancel(&t->timer) >= 0) - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); else queue_work(system_dfl_wq, &t->cb.delete_work); } else { @@ -1657,6 +1660,13 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .arg2_btf_id = BPF_PTR_POISON, }; +struct bpf_dynptr_file_impl { + struct freader freader; + /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */ + u64 offset; + u64 size; +}; + /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. */ @@ -1685,23 +1695,65 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + return df->size; + } + return ptr->size & DYNPTR_SIZE_MASK; } -static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off) +{ + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + df->offset += off; + return; + } + ptr->offset += off; +} + +static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; - ptr->size = new_size | metadata; + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + df->size = new_size; + return; + } + ptr->size = (u32)new_size | metadata; } -int bpf_dynptr_check_size(u32 size) +int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; } +static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len) +{ + const void *ptr; + + if (!buf) + return -EINVAL; + + df->freader.buf = buf; + df->freader.buf_sz = len; + ptr = freader_fetch(&df->freader, offset + df->offset, len); + if (!ptr) + return df->freader.err; + + if (ptr != buf) /* Force copying into the buffer */ + memcpy(buf, ptr, len); + + return 0; +} + void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { @@ -1716,7 +1768,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1751,8 +1803,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, - u32 offset, u64 flags) +static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, + u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1782,14 +1834,16 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s case BPF_DYNPTR_TYPE_SKB_META: memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); return 0; + case BPF_DYNPTR_TYPE_FILE: + return bpf_file_fetch_bytes(src->data, offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, + u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } @@ -1805,8 +1859,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, - u32 len, u64 flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, + u64 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1839,18 +1893,16 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: - if (flags) - return -EINVAL; - memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); - return 0; + return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src, + len, flags); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; } } -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, + u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } @@ -1866,7 +1918,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; @@ -2681,12 +2733,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; - u32 len = buffer__szk; + u64 len = buffer__szk; int err; if (!ptr->data) @@ -2720,6 +2772,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); + case BPF_DYNPTR_TYPE_FILE: + err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); + return err ? NULL : buffer__opt; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2768,8 +2823,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2801,10 +2856,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end) +__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 size; + u64 size; if (!ptr->data || start > end) return -EINVAL; @@ -2814,7 +2869,7 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end if (start > size || end > size) return -ERANGE; - ptr->offset += start; + bpf_dynptr_advance_offset(ptr, start); bpf_dynptr_set_size(ptr, end - start); return 0; @@ -2837,7 +2892,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) return __bpf_dynptr_is_rdonly(ptr); } -__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p) +__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2874,14 +2929,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, - struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, + struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; - u32 off; + u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); @@ -2903,7 +2958,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, off = 0; while (off < size) { - u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); @@ -2929,10 +2984,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ - __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) - { +__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +{ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 chunk_sz, write_off; + u64 chunk_sz, write_off; char buf[256]; void* slice; int err; @@ -2951,11 +3006,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return err; /* Non-linear data under the dynptr, write from a local buffer */ - chunk_sz = min_t(u32, sizeof(buf), size); + chunk_sz = min_t(u64, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - write_off); + chunk_sz = min_t(u64, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; @@ -3675,34 +3730,21 @@ err_out: return -EFAULT; } -/** - * bpf_strnstr - Find the first substring in a length-limited string - * @s1__ign: The string to be searched - * @s2__ign: The string to search for - * @len: the maximum number of characters to search - * - * Return: - * * >=0 - Index of the first character of the first occurrence of @s2__ign - * within the first @len characters of @s1__ign - * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign - * * %-EFAULT - Cannot read one of the strings - * * %-E2BIG - One of the strings is too large - * * %-ERANGE - One of the strings is outside of kernel address space - */ -__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) +static int __bpf_strnstr(const char *s1, const char *s2, size_t len, + bool ignore_case) { char c1, c2; int i, j; - if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || - !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + if (!copy_from_kernel_nofault_allowed(s1, 1) || + !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { - __get_kernel_nofault(&c2, s2__ign + j, char, err_out); + __get_kernel_nofault(&c2, s2 + j, char, err_out); if (c2 == '\0') return i; /* @@ -3712,7 +3754,13 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len */ if (i + j == len) break; - __get_kernel_nofault(&c1, s1__ign + j, char, err_out); + __get_kernel_nofault(&c1, s1 + j, char, err_out); + + if (ignore_case) { + c1 = tolower(c1); + c2 = tolower(c2); + } + if (c1 == '\0') return -ENOENT; if (c1 != c2) @@ -3722,7 +3770,7 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len return -E2BIG; if (i + j == len) return -ENOENT; - s1__ign++; + s1++; } return -E2BIG; err_out: @@ -3744,8 +3792,69 @@ err_out: */ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { - return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); + return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false); +} + +/** + * bpf_strcasestr - Find the first substring in a string, ignoring the case of + * the characters + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within @s1__ign + * * %-ENOENT - @s2__ign is not a substring of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true); +} + +/** + * bpf_strnstr - Find the first substring in a length-limited string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, + size_t len) +{ + return __bpf_strnstr(s1__ign, s2__ign, len, false); } + +/** + * bpf_strncasestr - Find the first substring in a length-limited string, + * ignoring the case of the characters + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign, + size_t len) +{ + return __bpf_strnstr(s1__ign, s2__ign, len, true); +} + #ifdef CONFIG_KEYS /** * bpf_lookup_user_key - lookup a key by its serial @@ -4166,7 +4275,8 @@ release_prog: } /** - * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4175,15 +4285,17 @@ release_prog: * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4192,13 +4304,62 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } +static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, + struct bpf_dynptr_kern *ptr) +{ + struct bpf_dynptr_file_impl *state; + + /* flags is currently unsupported */ + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl)); + if (!state) { + bpf_dynptr_set_null(ptr); + return -ENOMEM; + } + state->offset = 0; + state->size = U64_MAX; /* Don't restrict size, as file may change anyways */ + freader_init_from_file(&state->freader, NULL, 0, file, may_sleep); + bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0); + bpf_dynptr_set_rdonly(ptr); + return 0; +} + +__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit); +} + +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit); +} + +__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr; + struct bpf_dynptr_file_impl *df = ptr->data; + + if (!df) + return 0; + + freader_cleanup(&df->freader); + bpf_mem_free(&bpf_global_ma, df); + bpf_dynptr_set_null(ptr); + return 0; +} + __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) @@ -4342,6 +4503,7 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) +#ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) @@ -4350,6 +4512,7 @@ BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +#endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) @@ -4367,13 +4530,17 @@ BTF_ID_FLAGS(func, bpf_strnlen); BTF_ID_FLAGS(func, bpf_strspn); BTF_ID_FLAGS(func, bpf_strcspn); BTF_ID_FLAGS(func, bpf_strstr); +BTF_ID_FLAGS(func, bpf_strcasestr); BTF_ID_FLAGS(func, bpf_strnstr); +BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -4414,7 +4581,7 @@ late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; @@ -4425,9 +4592,19 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } + +void bpf_map_free_internal_structs(struct bpf_map *map, void *val) +{ + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, val); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, val); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, val); +} diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 3c611aba7f52..60db5d655495 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -34,7 +34,7 @@ * - read and write marks propagation. * - The propagation phase is a textbook live variable data flow analysis: * - * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)] + * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)] * state[cc, i].live_before = * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read * @@ -54,7 +54,7 @@ * The equation for "must_write_acc" propagation looks as follows: * * state[cc, i].must_write_acc = - * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)] + * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)] * U state[cc, i].must_write * * (An intersection of all "must_write_acc" for instruction successors @@ -195,8 +195,10 @@ static struct func_instance *__lookup_instance(struct bpf_verifier_env *env, return ERR_PTR(-ENOMEM); result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set), GFP_KERNEL_ACCOUNT); - if (!result->must_write_set) + if (!result->must_write_set) { + kvfree(result); return ERR_PTR(-ENOMEM); + } memcpy(&result->callchain, callchain, sizeof(*callchain)); result->insn_cnt = subprog_sz; hash_add(liveness->func_instances, &result->hl_node, key); @@ -445,7 +447,12 @@ int bpf_jmp_offset(struct bpf_insn *insn) __diag_push(); __diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); -inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +/* + * Returns an array of instructions succ, with succ->items[0], ..., + * succ->items[n-1] with successor instructions, where n=succ->cnt + */ +inline struct bpf_iarray * +bpf_insn_successors(struct bpf_verifier_env *env, u32 idx) { static const struct opcode_info { bool can_jump; @@ -472,19 +479,29 @@ inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), #undef _J }; + struct bpf_prog *prog = env->prog; struct bpf_insn *insn = &prog->insnsi[idx]; const struct opcode_info *opcode_info; - int i = 0, insn_sz; + struct bpf_iarray *succ, *jt; + int insn_sz; + + jt = env->insn_aux_data[idx].jt; + if (unlikely(jt)) + return jt; + + /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */ + succ = env->succ; + succ->cnt = 0; opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; if (opcode_info->can_fallthrough) - succ[i++] = idx + insn_sz; + succ->items[succ->cnt++] = idx + insn_sz; if (opcode_info->can_jump) - succ[i++] = idx + bpf_jmp_offset(insn) + 1; + succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1; - return i; + return succ; } __diag_pop(); @@ -522,6 +539,8 @@ static int propagate_to_outer_instance(struct bpf_verifier_env *env, this_subprog_start = callchain_subprog_start(callchain); outer_instance = get_outer_instance(env, instance); + if (IS_ERR(outer_instance)) + return PTR_ERR(outer_instance); callsite = callchain->callsites[callchain->curframe - 1]; reset_stack_write_marks(env, outer_instance, callsite); @@ -544,11 +563,12 @@ static inline bool update_insn(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = env->insn_aux_data; u64 new_before, new_after, must_write_acc; struct per_frame_masks *insn, *succ_insn; - u32 succ_num, s, succ[2]; + struct bpf_iarray *succ; + u32 s; bool changed; - succ_num = bpf_insn_successors(env->prog, insn_idx, succ); - if (unlikely(succ_num == 0)) + succ = bpf_insn_successors(env, insn_idx); + if (succ->cnt == 0) return false; changed = false; @@ -560,8 +580,8 @@ static inline bool update_insn(struct bpf_verifier_env *env, * of successors plus all "must_write" slots of instruction itself. */ must_write_acc = U64_MAX; - for (s = 0; s < succ_num; ++s) { - succ_insn = get_frame_masks(instance, frame, succ[s]); + for (s = 0; s < succ->cnt; ++s) { + succ_insn = get_frame_masks(instance, frame, succ->items[s]); new_after |= succ_insn->live_before; must_write_acc &= succ_insn->must_write_acc; } diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index f50533169cc3..a0c3b35de2ce 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", + [PTR_TO_INSN] = "insn", [PTR_TO_MAP_KEY] = "map_key", [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; @@ -500,6 +501,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "xdp"; case BPF_DYNPTR_TYPE_SKB_META: return "skb_meta"; + case BPF_DYNPTR_TYPE_FILE: + return "file"; case BPF_DYNPTR_TYPE_INVALID: return "<invalid>"; default: diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index 37b80a23ae1a..99c63d982c5d 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -2,7 +2,6 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include <linux/interval_tree_generic.h> #include <linux/slab.h> -#include <linux/bpf_mem_alloc.h> #include <linux/bpf.h> #include "range_tree.h" @@ -21,7 +20,7 @@ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree"). * * The implementation relies on external lock to protect rbtree-s. - * The alloc/free of range_node-s is done via bpf_mem_alloc. + * The alloc/free of range_node-s is done via kmalloc_nolock(). * * bpf arena is using range_tree to represent unallocated slots. * At init time: @@ -150,9 +149,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) range_it_insert(rn, rt); /* Add a range */ - migrate_disable(); - new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); - migrate_enable(); + new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); if (!new_rn) return -ENOMEM; new_rn->rn_start = last + 1; @@ -172,9 +169,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) } else { /* in the middle of the clearing range */ range_it_remove(rn, rt); - migrate_disable(); - bpf_mem_free(&bpf_global_ma, rn); - migrate_enable(); + kfree_nolock(rn); } } return 0; @@ -227,9 +222,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) range_it_remove(right, rt); left->rn_last = right->rn_last; range_it_insert(left, rt); - migrate_disable(); - bpf_mem_free(&bpf_global_ma, right); - migrate_enable(); + kfree_nolock(right); } else if (left) { /* Combine with the left range */ range_it_remove(left, rt); @@ -241,9 +234,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) right->rn_start = start; range_it_insert(right, rt); } else { - migrate_disable(); - left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); - migrate_enable(); + left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); if (!left) return -ENOMEM; left->rn_start = start; @@ -259,7 +250,7 @@ void range_tree_destroy(struct range_tree *rt) while ((rn = range_it_iter_first(rt, 0, -1U))) { range_it_remove(rn, rt); - bpf_mem_free(&bpf_global_ma, rn); + kfree_nolock(rn); } } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 719d73299397..f6a075ffac63 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -13,7 +13,7 @@ #include <linux/btf_ids.h> #include <asm/rqspinlock.h> -#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE) /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ #define RINGBUF_PGOFF \ @@ -30,6 +30,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; + bool overwrite_mode; rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than @@ -73,6 +74,7 @@ struct bpf_ringbuf { unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); unsigned long pending_pos; + unsigned long overwrite_pos; /* position after the last overwritten record */ char data[] __aligned(PAGE_SIZE); }; @@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work) * considering that the maximum value of data_sz is (4GB - 1), there * will be no overflow, so just note the size limit in the comments. */ -static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode) { struct bpf_ringbuf *rb; @@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb->consumer_pos = 0; rb->producer_pos = 0; rb->pending_pos = 0; + rb->overwrite_mode = overwrite_mode; return rb; } static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { + bool overwrite_mode = false; struct bpf_ringbuf_map *rb_map; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + if (attr->map_flags & BPF_F_RB_OVERWRITE) { + if (attr->map_type != BPF_MAP_TYPE_RINGBUF) + return ERR_PTR(-EINVAL); + overwrite_mode = true; + } + if (attr->key_size || attr->value_size || !is_power_of_2(attr->max_entries) || !PAGE_ALIGNED(attr->max_entries)) @@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&rb_map->map, attr); - rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode); if (!rb_map->rb) { bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); @@ -216,6 +226,8 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) static void bpf_ringbuf_free(struct bpf_ringbuf *rb) { + irq_work_sync(&rb->work); + /* copy pages pointer and nr_pages to local variable, as we are going * to unmap rb itself with vunmap() below */ @@ -293,13 +305,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } +/* + * Return an estimate of the available data in the ring buffer. + * Note: the returned value can exceed the actual ring buffer size because the + * function is not synchronized with the producer. The producer acquires the + * ring buffer's spinlock, but this function does not. + */ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { - unsigned long cons_pos, prod_pos; + unsigned long cons_pos, prod_pos, over_pos; cons_pos = smp_load_acquire(&rb->consumer_pos); - prod_pos = smp_load_acquire(&rb->producer_pos); - return prod_pos - cons_pos; + + if (unlikely(rb->overwrite_mode)) { + over_pos = smp_load_acquire(&rb->overwrite_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - max(cons_pos, over_pos); + } else { + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; + } } static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) @@ -402,11 +427,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) return (void*)((addr & PAGE_MASK) - off); } +static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb, + unsigned long new_prod_pos, + unsigned long cons_pos, + unsigned long pend_pos) +{ + /* + * No space if oldest not yet committed record until the newest + * record span more than (ringbuf_size - 1). + */ + if (new_prod_pos - pend_pos > rb->mask) + return false; + + /* Ok, we have space in overwrite mode */ + if (unlikely(rb->overwrite_mode)) + return true; + + /* + * No space if producer position advances more than (ringbuf_size - 1) + * ahead of consumer position when not in overwrite mode. + */ + if (new_prod_pos - cons_pos > rb->mask) + return false; + + return true; +} + +static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len) +{ + hdr_len &= ~BPF_RINGBUF_DISCARD_BIT; + return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8); +} + static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { - unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; + unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags; struct bpf_ringbuf_hdr *hdr; - u32 len, pg_off, tmp_size, hdr_len; + u32 len, pg_off, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; @@ -429,24 +486,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) hdr_len = READ_ONCE(hdr->len); if (hdr_len & BPF_RINGBUF_BUSY_BIT) break; - tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; - tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); - pend_pos += tmp_size; + pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); } rb->pending_pos = pend_pos; - /* check for out of ringbuf space: - * - by ensuring producer position doesn't advance more than - * (ringbuf_size - 1) ahead - * - by ensuring oldest not yet committed record until newest - * record does not span more than (ringbuf_size - 1) - */ - if (new_prod_pos - cons_pos > rb->mask || - new_prod_pos - pend_pos > rb->mask) { + if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) { raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } + /* + * In overwrite mode, advance overwrite_pos when the ring buffer is full. + * The key points are to stay on record boundaries and consume enough records + * to fit the new one. + */ + if (unlikely(rb->overwrite_mode)) { + over_pos = rb->overwrite_pos; + while (new_prod_pos - over_pos > rb->mask) { + hdr = (void *)rb->data + (over_pos & rb->mask); + hdr_len = READ_ONCE(hdr->len); + /* + * The bpf_ringbuf_has_space() check above ensures we won’t + * step over a record currently being worked on by another + * producer. + */ + over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); + } + /* + * smp_store_release(&rb->producer_pos, new_prod_pos) at + * the end of the function ensures that when consumer sees + * the updated rb->producer_pos, it always sees the updated + * rb->overwrite_pos, so when consumer reads overwrite_pos + * after smp_load_acquire(r->producer_pos), the overwrite_pos + * will always be valid. + */ + WRITE_ONCE(rb->overwrite_pos, over_pos); + } + hdr = (void *)rb->data + (prod_pos & rb->mask); pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); hdr->len = size | BPF_RINGBUF_BUSY_BIT; @@ -576,6 +652,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); + case BPF_RB_OVERWRITE_POS: + return smp_load_acquire(&rb->overwrite_pos); default: return 0; } diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index a00561b1d3e5..f7d0c8d4644e 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -89,15 +89,14 @@ struct rqspinlock_timeout { DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); EXPORT_SYMBOL_GPL(rqspinlock_held_locks); -static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) +static bool is_lock_released(rqspinlock_t *lock, u32 mask) { if (!(atomic_read_acquire(&lock->val) & (mask))) return true; return false; } -static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) +static noinline int check_deadlock_AA(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int cnt = min(RES_NR_HELD, rqh->cnt); @@ -118,8 +117,7 @@ static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, * more locks, which reduce to ABBA). This is not exhaustive, and we rely on * timeouts as the final line of defense. */ -static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) +static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int rqh_cnt = min(RES_NR_HELD, rqh->cnt); @@ -142,7 +140,7 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, * Let's ensure to break out of this loop if the lock is available for * us to potentially acquire. */ - if (is_lock_released(lock, mask, ts)) + if (is_lock_released(lock, mask)) return 0; /* @@ -198,33 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, return 0; } -static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) -{ - int ret; - - ret = check_deadlock_AA(lock, mask, ts); - if (ret) - return ret; - ret = check_deadlock_ABBA(lock, mask, ts); - if (ret) - return ret; - - return 0; -} - static noinline int check_timeout(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) { - u64 time = ktime_get_mono_fast_ns(); u64 prev = ts->cur; + u64 time; if (!ts->timeout_end) { - ts->cur = time; - ts->timeout_end = time + ts->duration; + if (check_deadlock_AA(lock)) + return -EDEADLK; + ts->cur = ktime_get_mono_fast_ns(); + ts->timeout_end = ts->cur + ts->duration; return 0; } + time = ktime_get_mono_fast_ns(); if (time > ts->timeout_end) return -ETIMEDOUT; @@ -234,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask, */ if (prev + NSEC_PER_MSEC < time) { ts->cur = time; - return check_deadlock(lock, mask, ts); + return check_deadlock_ABBA(lock, mask); } return 0; @@ -278,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) int val, ret = 0; RES_INIT_TIMEOUT(ts); + /* + * The fast path is not invoked for the TAS fallback, so we must grab + * the deadlock detection entry here. + */ grab_held_lock_entry(lock); /* @@ -400,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) goto queue; } - /* - * Grab an entry in the held locks array, to enable deadlock detection. - */ - grab_held_lock_entry(lock); + /* Deadlock detection entry already held after failing fast path. */ /* * We're pending, wait for the owner to go away. @@ -450,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) * queuing. */ queue: - lockevent_inc(lock_slowpath); /* - * Grab deadlock detection entry for the queue path. + * Do not queue if we're a waiter and someone is attempting this lock on + * the same CPU. In case of NMIs, this prevents long timeouts where we + * interrupt the pending waiter, and the owner, that will eventually + * signal the head of our queue, both of which are logically but not + * physically part of the queue, hence outside the scope of the idx > 0 + * check above for the trylock fallback. */ - grab_held_lock_entry(lock); + if (check_deadlock_AA(lock)) { + ret = -EDEADLK; + goto err_release_entry; + } + lockevent_inc(lock_slowpath); + /* Deadlock detection entry already held after failing fast path. */ node = this_cpu_ptr(&rqnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); @@ -467,19 +463,17 @@ queue: * not be nested NMIs taking spinlocks. That may not be true in * some architectures even though the chance of needing more than * 4 nodes will still be extremely unlikely. When that happens, - * we fall back to spinning on the lock directly without using - * any MCS node. This is not the most elegant solution, but is - * simple enough. + * we fall back to attempting a trylock operation without using + * any MCS node. Unlike qspinlock which cannot fail, we have the + * option of failing the slow path, and under contention, such a + * trylock spinning will likely be treated unfairly due to lack of + * queueing, hence do not spin. */ - if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { + if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) { lockevent_inc(lock_no_node); - RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); - while (!queued_spin_trylock(lock)) { - if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { - lockevent_inc(rqspinlock_lock_timeout); - goto err_release_node; - } - cpu_relax(); + if (!queued_spin_trylock(lock)) { + ret = -EDEADLK; + goto err_release_node; } goto release; } @@ -540,7 +534,7 @@ queue: val = arch_mcs_spin_lock_contended(&node->locked); if (val == RES_TIMEOUT_VAL) { - ret = -EDEADLK; + ret = -ETIMEDOUT; goto waitq_timeout; } @@ -575,6 +569,14 @@ queue: val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); + /* Disable queue destruction when we detect deadlocks. */ + if (ret == -EDEADLK) { + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); + arch_mcs_spin_unlock_contended(&next->locked); + goto err_release_node; + } + waitq_timeout: if (ret) { /* diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4d53cdd1374c..da3d328f5c15 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -42,6 +42,28 @@ static inline int stack_map_data_size(struct bpf_map *map) sizeof(struct bpf_stack_build_id) : sizeof(u64); } +/** + * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth + * @size: Size of the buffer/map value in bytes + * @elem_size: Size of each stack trace element + * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...) + * + * Return: Maximum number of stack trace entries that can be safely stored + */ +static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags) +{ + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 max_depth; + u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack); + + max_depth = size / elem_size; + max_depth += skip; + if (max_depth > curr_sysctl_max_stack) + return curr_sysctl_max_stack; + + return max_depth; +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + @@ -229,8 +251,8 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 hash, id, trace_nr, trace_len, i, max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; - u32 hash, id, trace_nr, trace_len, i; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; @@ -239,7 +261,8 @@ static long __bpf_get_stackid(struct bpf_map *map, /* skipping more than usable stack trace */ return -EFAULT; - trace_nr = trace->nr - skip; + max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); + trace_nr = min_t(u32, trace->nr - skip, max_depth - skip); trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); @@ -300,22 +323,19 @@ static long __bpf_get_stackid(struct bpf_map *map, BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { - u32 max_depth = map->value_size / stack_map_data_size(map); - u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 elem_size = stack_map_data_size(map); bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; + u32 max_depth; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - max_depth += skip; - if (max_depth > sysctl_perf_event_max_stack) - max_depth = sysctl_perf_event_max_stack; - + max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags); trace = get_perf_callchain(regs, kernel, user, max_depth, - false, false); + false, false, 0); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -371,15 +391,11 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, return -EFAULT; nr_kernel = count_kernel_ip(trace); + __u64 nr = trace->nr; /* save original */ if (kernel) { - __u64 nr = trace->nr; - trace->nr = nr_kernel; ret = __bpf_get_stackid(map, trace, flags); - - /* restore nr */ - trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -390,6 +406,10 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; ret = __bpf_get_stackid(map, trace, flags); } + + /* restore nr */ + trace->nr = nr; + return ret; } @@ -406,7 +426,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags, bool may_fault) { - u32 trace_nr, copy_len, elem_size, num_elem, max_depth; + u32 trace_nr, copy_len, elem_size, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -438,21 +458,20 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto clear; } - num_elem = size / elem_size; - max_depth = num_elem + skip; - if (sysctl_perf_event_max_stack < max_depth) - max_depth = sysctl_perf_event_max_stack; + max_depth = stack_map_calculate_max_depth(size, elem_size, flags); if (may_fault) rcu_read_lock(); /* need RCU for perf's callchain below */ - if (trace_in) + if (trace_in) { trace = trace_in; - else if (kernel && task) + trace->nr = min_t(u32, trace->nr, max_depth); + } else if (kernel && task) { trace = get_callchain_entry_for_task(task, max_depth); - else + } else { trace = get_perf_callchain(regs, kernel, user, max_depth, - crosstask, false); + crosstask, false, 0); + } if (unlikely(!trace) || trace->nr < skip) { if (may_fault) @@ -461,7 +480,6 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, } trace_nr = trace->nr - skip; - trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index eb6c5a21c2ef..0b6bc3f30335 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -4,111 +4,10 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> -#include <linux/percpu.h> -#include <linux/refcount.h> #include <linux/gfp.h> #include <linux/memory.h> -#include <linux/local_lock.h> #include <linux/mutex.h> -/* - * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe - * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and - * stash it in a local per-CPU variable, and bump allocate from the page - * whenever items need to be printed to a stream. Each page holds a global - * atomic refcount in its first 4 bytes, and then records of variable length - * that describe the printed messages. Once the global refcount has dropped to - * zero, it is a signal to free the page back to the kernel's page allocator, - * given all the individual records in it have been consumed. - * - * It is possible the same page is used to serve allocations across different - * programs, which may be consumed at different times individually, hence - * maintaining a reference count per-page is critical for correct lifetime - * tracking. - * - * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it - * lands. - */ -struct bpf_stream_page { - refcount_t ref; - u32 consumed; - char buf[]; -}; - -/* Available room to add data to a refcounted page. */ -#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) - -static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); -static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); - -static bool bpf_stream_page_local_lock(unsigned long *flags) -{ - return local_trylock_irqsave(&stream_local_lock, *flags); -} - -static void bpf_stream_page_local_unlock(unsigned long *flags) -{ - local_unlock_irqrestore(&stream_local_lock, *flags); -} - -static void bpf_stream_page_free(struct bpf_stream_page *stream_page) -{ - struct page *p; - - if (!stream_page) - return; - p = virt_to_page(stream_page); - free_pages_nolock(p, 0); -} - -static void bpf_stream_page_get(struct bpf_stream_page *stream_page) -{ - refcount_inc(&stream_page->ref); -} - -static void bpf_stream_page_put(struct bpf_stream_page *stream_page) -{ - if (refcount_dec_and_test(&stream_page->ref)) - bpf_stream_page_free(stream_page); -} - -static void bpf_stream_page_init(struct bpf_stream_page *stream_page) -{ - refcount_set(&stream_page->ref, 1); - stream_page->consumed = 0; -} - -static struct bpf_stream_page *bpf_stream_page_replace(void) -{ - struct bpf_stream_page *stream_page, *old_stream_page; - struct page *page; - - page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); - if (!page) - return NULL; - stream_page = page_address(page); - bpf_stream_page_init(stream_page); - - old_stream_page = this_cpu_read(stream_pcpu_page); - if (old_stream_page) - bpf_stream_page_put(old_stream_page); - this_cpu_write(stream_pcpu_page, stream_page); - return stream_page; -} - -static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) -{ - int min = offsetof(struct bpf_stream_elem, str[0]); - int consumed = stream_page->consumed; - int total = BPF_STREAM_PAGE_SZ; - int rem = max(0, total - consumed - min); - - /* Let's give room of at least 8 bytes. */ - WARN_ON_ONCE(rem % 8 != 0); - rem = rem < 8 ? 0 : rem; - return min(len, rem); -} - static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) { init_llist_node(&elem->node); @@ -116,54 +15,12 @@ static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) elem->consumed_len = 0; } -static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) -{ - unsigned long addr = (unsigned long)elem; - - return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); -} - -static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) -{ - u32 consumed = stream_page->consumed; - - stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); - return (struct bpf_stream_elem *)&stream_page->buf[consumed]; -} - -static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) -{ - struct bpf_stream_elem *elem = NULL; - struct bpf_stream_page *page; - int room = 0; - - page = this_cpu_read(stream_pcpu_page); - if (!page) - page = bpf_stream_page_replace(); - if (!page) - return NULL; - - room = bpf_stream_page_check_room(page, len); - if (room != len) - page = bpf_stream_page_replace(); - if (!page) - return NULL; - bpf_stream_page_get(page); - room = bpf_stream_page_check_room(page, len); - WARN_ON_ONCE(room != len); - - elem = bpf_stream_page_push_elem(page, room); - bpf_stream_elem_init(elem, room); - return elem; -} - static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) { const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); struct bpf_stream_elem *elem; - unsigned long flags; + size_t alloc_size; - BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); /* * Length denotes the amount of data to be written as part of stream element, * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can @@ -172,10 +29,13 @@ static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) if (len < 0 || len > max_len) return NULL; - if (!bpf_stream_page_local_lock(&flags)) + alloc_size = offsetof(struct bpf_stream_elem, str[len]); + elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1); + if (!elem) return NULL; - elem = bpf_stream_page_reserve_elem(len); - bpf_stream_page_local_unlock(&flags); + + bpf_stream_elem_init(elem, len); + return elem; } @@ -231,10 +91,7 @@ static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bp static void bpf_stream_free_elem(struct bpf_stream_elem *elem) { - struct bpf_stream_page *p; - - p = bpf_stream_page_from_elem(elem); - bpf_stream_page_put(p); + kfree_nolock(elem); } static void bpf_stream_free_list(struct llist_node *list) @@ -355,7 +212,8 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, void *aux__prog) { struct bpf_bprintf_data data = { .get_bin_args = true, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2a9456a3e730..6589acc89ef8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -158,7 +158,7 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) - synchronize_rcu(); + synchronize_rcu_expedited(); } static void unpin_uptr_kaddr(void *kaddr) @@ -520,6 +520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, return ptr; } +void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + struct mem_cgroup *memcg, *old_memcg; + void *ptr; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); + ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + + return ptr; +} + void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; @@ -1219,6 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) return src - orig_src; } +EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); int map_check_no_btf(const struct bpf_map *map, const struct btf *btf, @@ -1478,6 +1494,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; @@ -1570,7 +1587,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) goto free_map; } } else if (attr->excl_prog_hash_size) { - return -EINVAL; + err = -EINVAL; + goto free_map; } err = security_bpf_map_create(map, attr, token, uattr.is_kernel); @@ -1709,9 +1727,6 @@ static int map_lookup_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; - if (attr->flags & ~BPF_F_LOCK) - return -EINVAL; - CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -1719,9 +1734,9 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) - return -EINVAL; + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + if (err) + return err; key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) @@ -1784,11 +1799,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - err = -EINVAL; + err = bpf_map_check_op_flags(map, attr->flags, ~0); + if (err) goto err_put; - } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { @@ -1992,13 +2005,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - if (attr->batch.elem_flags & ~BPF_F_LOCK) - return -EINVAL; - - if ((attr->batch.elem_flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - return -EINVAL; - } + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + if (err) + return err; value_size = bpf_map_value_size(map); @@ -2055,12 +2064,9 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - if (attr->batch.elem_flags & ~BPF_F_LOCK) - return -EINVAL; - - if ((attr->batch.elem_flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) - return -EINVAL; + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + if (err) + return err; value_size = bpf_map_value_size(map); @@ -2315,7 +2321,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) return; if (audit_enabled == AUDIT_OFF) return; - if (!in_irq() && !irqs_disabled()) + if (!in_hardirq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) @@ -2413,7 +2419,7 @@ static void __bpf_prog_put(struct bpf_prog *prog) struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { - if (in_irq() || irqs_disabled()) { + if (in_hardirq() || irqs_disabled()) { INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); } else { @@ -2447,6 +2453,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) struct bpf_prog_stats *stats; unsigned int flags; + if (unlikely(!prog->stats)) + return; + stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); @@ -2838,6 +2847,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr return err; } +static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) +{ + int err; + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) + continue; + + err = bpf_insn_array_ready(prog->aux->used_maps[i]); + if (err) + return err; + } + + return 0; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id @@ -3067,6 +3093,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_used_maps; + err = bpf_prog_mark_insn_arrays_ready(prog); + if (err < 0) + goto free_used_maps; + err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; @@ -5019,19 +5049,19 @@ static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { + if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) { + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); + if (!insns_sanitized) + return -ENOMEM; + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) + return -EFAULT; + } else { info.xlated_prog_insns = 0; - goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); - if (!insns_sanitized) - return -ENOMEM; - uinsns = u64_to_user_ptr(info.xlated_prog_insns); - ulen = min_t(u32, info.xlated_prog_len, ulen); - fault = copy_to_user(uinsns, insns_sanitized, ulen); - kfree(insns_sanitized); - if (fault) - return -EFAULT; } if (bpf_prog_is_offloaded(prog->aux)) { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 0bbe412f854e..feecd8f4dbf9 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -110,16 +110,15 @@ const struct file_operations bpf_token_fops = { int bpf_token_create(union bpf_attr *attr) { + struct bpf_token *token __free(kfree) = NULL; struct bpf_mount_opts *mnt_opts; - struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; - struct file *file; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; - int err, fd; + int err; if (fd_empty(f)) return -EBADF; @@ -166,23 +165,20 @@ int bpf_token_create(union bpf_attr *attr) inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ - file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); - if (IS_ERR(file)) { - iput(inode); - return PTR_ERR(file); - } + FD_PREPARE(fdf, O_CLOEXEC, + alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, + O_RDWR, &bpf_token_fops)); + if (fdf.err) + return fdf.err; token = kzalloc(sizeof(*token), GFP_USER); - if (!token) { - err = -ENOMEM; - goto out_file; - } + if (!token) + return -ENOMEM; atomic64_set(&token->refcnt, 1); - /* remember bpffs owning userns for future ns_capable() checks */ - token->userns = get_user_ns(userns); - + /* remember bpffs owning userns for future ns_capable() checks. */ + token->userns = userns; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; @@ -190,24 +186,11 @@ int bpf_token_create(union bpf_attr *attr) err = security_bpf_token_create(token, attr, &path); if (err) - goto out_token; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_token; - } - - file->private_data = token; - fd_install(fd, file); - - return fd; + return err; -out_token: - bpf_token_free(token); -out_file: - fput(file); - return err; + get_user_ns(token->userns); + fd_prepare_file(fdf)->private_data = no_free_ptr(token); + return fd_publish(fdf); } int bpf_token_get_info_by_fd(struct bpf_token *token, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5949095e51c3..976d89011b15 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -175,23 +175,42 @@ out: return tr; } -static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) +static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr, void *new_addr) { + enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL; void *ip = tr->func.addr; + + if (!new_addr) + new_t = BPF_MOD_NOP; + else if (bpf_trampoline_use_jmp(tr->flags)) + new_t = BPF_MOD_JUMP; + + if (!old_addr) + old_t = BPF_MOD_NOP; + else if (bpf_trampoline_use_jmp(orig_flags)) + old_t = BPF_MOD_JUMP; + + return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); +} + +static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr) +{ int ret; if (tr->func.ftrace_managed) ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); else - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); + ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); return ret; } -static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, +static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr, void *new_addr, bool lock_direct_mutex) { - void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) { @@ -200,7 +219,8 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad else ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); } else { - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, + new_addr); } return ret; } @@ -220,10 +240,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { - ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + if (ret) + return ret; ret = register_ftrace_direct(tr->fops, (long)new_addr); } else { - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } return ret; @@ -334,8 +356,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) * call_rcu_tasks() is not necessary. */ if (im->ip_after_call) { - int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, - NULL, im->ip_epilogue); + int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, + im->ip_epilogue); WARN_ON(err); if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); @@ -408,7 +431,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut return PTR_ERR(tlinks); if (total == 0) { - err = unregister_fentry(tr, tr->cur_image->image); + err = unregister_fentry(tr, orig_flags, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; @@ -432,9 +455,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS again: - if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && - (tr->flags & BPF_TRAMP_F_CALL_ORIG)) - tr->flags |= BPF_TRAMP_F_ORIG_STACK; + if (tr->flags & BPF_TRAMP_F_CALL_ORIG) { + if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) { + /* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the + * first try, reset it in the second try. + */ + tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME; + } else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) { + /* Use "jmp" instead of "call" for the trampoline + * in the origin call case, and we don't need to + * skip the frame. + */ + tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME; + } + } #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, @@ -465,10 +499,18 @@ again: if (err) goto out_free; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP + if (bpf_trampoline_use_jmp(tr->flags)) + tr->fops->flags |= FTRACE_OPS_FL_JMP; + else + tr->fops->flags &= ~FTRACE_OPS_FL_JMP; +#endif + WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); + err = modify_fentry(tr, orig_flags, tr->cur_image->image, + im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); @@ -479,11 +521,6 @@ again: * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. */ - /* reset fops->func and fops->trampoline for re-register */ - tr->fops->func = NULL; - tr->fops->trampoline = 0; - - /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } @@ -496,8 +533,15 @@ again: tr->cur_image = im; out: /* If any error happens, restore previous flags */ - if (err) + if (err) { tr->flags = orig_flags; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP + if (bpf_trampoline_use_jmp(tr->flags)) + tr->fops->flags |= FTRACE_OPS_FL_JMP; + else + tr->fops->flags &= ~FTRACE_OPS_FL_JMP; +#endif + } kfree(tlinks); return err; @@ -573,7 +617,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, if (err) return err; tr->extension_prog = link->link.prog; - return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } if (cnt >= BPF_MAX_TRAMP_LINKS) @@ -621,6 +666,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, + BPF_MOD_NOP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; guard(mutex)(&tgt_prog->aux->ext_mutex); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff40e5e65c43..f0ca69f888fa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -209,8 +209,6 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr); static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) @@ -515,6 +513,7 @@ static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { @@ -547,6 +546,21 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn) (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } +static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + /* bpf_timer callbacks are never sleepable. */ + if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback) + return false; + + /* bpf_wq and bpf_task_work callbacks are always sleepable. */ + if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && + (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + return true; + + verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); + return false; +} + static bool is_may_goto_insn(struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; @@ -676,6 +690,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_XDP; case DYNPTR_TYPE_SKB_META: return BPF_DYNPTR_TYPE_SKB_META; + case DYNPTR_TYPE_FILE: + return BPF_DYNPTR_TYPE_FILE; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -694,6 +710,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_XDP; case BPF_DYNPTR_TYPE_SKB_META: return DYNPTR_TYPE_SKB_META; + case BPF_DYNPTR_TYPE_FILE: + return DYNPTR_TYPE_FILE; default: return 0; } @@ -701,7 +719,7 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { - return type == BPF_DYNPTR_TYPE_RINGBUF; + return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, @@ -812,6 +830,15 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = func(env, reg); int spi, ref_obj_id, i; + /* + * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (reg->type == CONST_PTR_TO_DYNPTR) { + verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); + return -EFAULT; + } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -1410,7 +1437,7 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->acquired_refs = src->acquired_refs; dst->active_locks = src->active_locks; dst->active_preempt_locks = src->active_preempt_locks; - dst->active_rcu_lock = src->active_rcu_lock; + dst->active_rcu_locks = src->active_rcu_locks; dst->active_irq_id = src->active_irq_id; dst->active_lock_id = src->active_lock_id; dst->active_lock_ptr = src->active_lock_ptr; @@ -2093,7 +2120,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - return NULL; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2103,12 +2130,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size++; err = copy_verifier_state(&elem->st, cur); if (err) - return NULL; + return ERR_PTR(-ENOMEM); elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex.\n", env->stack_size); - return NULL; + return ERR_PTR(-E2BIG); } if (elem->st.parent) { ++elem->st.parent->branches; @@ -2903,7 +2930,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - return NULL; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2915,7 +2942,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, verbose(env, "The sequence of %d jumps is too complex for async cb.\n", env->stack_size); - return NULL; + return ERR_PTR(-E2BIG); } /* Unlike push_stack() do not copy_verifier_state(). * The caller state doesn't matter. @@ -2926,7 +2953,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) - return NULL; + return ERR_PTR(-ENOMEM); init_func_state(env, frame, BPF_MAIN_FUNC /* callsite */, 0 /* frameno within this callchain */, @@ -3097,6 +3124,9 @@ struct bpf_kfunc_btf_tab { u32 nr_descs; }; +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, + int insn_idx); + static int kfunc_desc_cmp_by_id_off(const void *a, const void *b) { const struct bpf_kfunc_desc *d0 = a; @@ -3114,7 +3144,7 @@ static int kfunc_btf_cmp_by_off(const void *a, const void *b) return d0->offset - d1->offset; } -static const struct bpf_kfunc_desc * +static struct bpf_kfunc_desc * find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) { struct bpf_kfunc_desc desc = { @@ -3237,12 +3267,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) { const struct btf_type *func, *func_proto; struct bpf_kfunc_btf_tab *btf_tab; + struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; struct bpf_kfunc_desc *desc; const char *func_name; struct btf *desc_btf; - unsigned long call_imm; unsigned long addr; int err; @@ -3326,19 +3356,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) func_name); return -EINVAL; } - specialize_kfunc(env, func_id, offset, &addr); - - if (bpf_jit_supports_far_kfunc_call()) { - call_imm = func_id; - } else { - call_imm = BPF_CALL_IMM(addr); - /* Check whether the relative offset overflows desc->imm */ - if ((unsigned long)(s32)call_imm != call_imm) { - verbose(env, "address of kernel function %s is out of range\n", - func_name); - return -EINVAL; - } - } if (bpf_dev_bound_kfunc_id(func_id)) { err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); @@ -3346,18 +3363,20 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } + err = btf_distill_func_proto(&env->log, desc_btf, + func_proto, func_name, + &func_model); + if (err) + return err; + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = call_imm; desc->offset = offset; desc->addr = addr; - err = btf_distill_func_proto(&env->log, desc_btf, - func_proto, func_name, - &desc->func_model); - if (!err) - sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), - kfunc_desc_cmp_by_id_off, NULL); - return err; + desc->func_model = func_model; + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_id_off, NULL); + return 0; } static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) @@ -3372,16 +3391,43 @@ static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) return 0; } -static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog) +static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc) +{ + unsigned long call_imm; + + if (bpf_jit_supports_far_kfunc_call()) { + call_imm = desc->func_id; + } else { + call_imm = BPF_CALL_IMM(desc->addr); + /* Check whether the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel func_id %u is out of range\n", + desc->func_id); + return -EINVAL; + } + } + desc->imm = call_imm; + return 0; +} + +static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env) { struct bpf_kfunc_desc_tab *tab; + int i, err; - tab = prog->aux->kfunc_tab; + tab = env->prog->aux->kfunc_tab; if (!tab) - return; + return 0; + + for (i = 0; i < tab->nr_descs; i++) { + err = set_kfunc_desc_imm(env, &tab->descs[i]); + if (err) + return err; + } sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off, NULL); + return 0; } bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) @@ -3509,8 +3555,12 @@ static int check_subprogs(struct bpf_verifier_env *env) subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; - if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + if (BPF_OP(code) == BPF_CALL) goto next; + if (BPF_OP(code) == BPF_EXIT) { + subprog[cur_subprog].exit_idx = i; + goto next; + } off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); @@ -4392,6 +4442,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, bt_reg_mask(bt)); return -EFAULT; } + if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call + && subseq_idx - idx != 1) { + if (bt_subprog_enter(bt)) + return -EFAULT; + } } else if (opcode == BPF_EXIT) { bool r0_precise; @@ -5826,8 +5881,7 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable || - (env->cur_state && env->cur_state->in_sleepable); + return env->cur_state->in_sleepable; } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -5835,7 +5889,7 @@ static bool in_sleepable(struct bpf_verifier_env *env) */ static bool in_rcu_cs(struct bpf_verifier_env *env) { - return env->cur_state->active_rcu_lock || + return env->cur_state->active_rcu_locks || env->cur_state->active_locks || !in_sleepable(env); } @@ -5988,6 +6042,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return 0; } +/* + * Return the size of the memory region accessible from a pointer to map value. + * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible. + */ +static u32 map_mem_size(const struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + return map->max_entries * sizeof(long); + + return map->value_size; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed, @@ -5997,11 +6063,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; + u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, map->value_size, - zero_size_allowed); + err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -6416,6 +6482,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; + if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY) + strict = true; break; case PTR_TO_CTX: pointer_desc = "context "; @@ -7039,6 +7107,9 @@ BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; +#ifdef CONFIG_MEMCG + struct task_struct __rcu *owner; +#endif }; /* skb->sk, req->sk are not RCU protected, but we mark them as such @@ -7078,6 +7149,11 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) { + struct mm_struct *vm_mm; + struct file *vm_file; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7119,6 +7195,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); @@ -7502,10 +7579,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; - /* if map is read-only, track its contents as scalars */ + /* + * If map is read-only, track its contents as scalars, + * unless it is an insn array (see the special case below) + */ if (tnum_is_const(reg->var_off) && bpf_map_is_rdonly(map) && - map->ops->map_direct_value_addr) { + map->ops->map_direct_value_addr && + map->map_type != BPF_MAP_TYPE_INSN_ARRAY) { int map_off = off + reg->var_off.value; u64 val = 0; @@ -7516,6 +7597,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regs[value_regno].type = SCALAR_VALUE; __mark_reg_known(®s[value_regno], val); + } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + if (bpf_size != BPF_DW) { + verbose(env, "Invalid read of %d bytes from insn_array\n", + size); + return -EACCES; + } + copy_register_state(®s[value_regno], reg); + regs[value_regno].type = PTR_TO_INSN; } else { mark_reg_unknown(env, regs, value_regno); } @@ -8464,6 +8553,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, case BPF_TASK_WORK: field_off = map->record->task_work_off; break; + case BPF_WORKQUEUE: + field_off = map->record->wq_off; + break; default: verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; @@ -8505,13 +8597,17 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_map *map = reg->map_ptr; - u64 val = reg->var_off.value; + int err; - if (map->record->wq_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", - val + reg->off, map->record->wq_off); - return -EINVAL; + err = check_map_field_pointer(env, regno, BPF_WORKQUEUE); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_wq helper"); + return -EFAULT; } + meta->map.uid = reg->map_uid; meta->map.ptr = map; return 0; @@ -8866,7 +8962,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; - int i, fr; + int i, fr, num_slots; reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { @@ -8879,7 +8975,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, &fcur->regs[i], &env->idmap_scratch); - for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + num_slots = min(fold->allocated_stack / BPF_REG_SIZE, + fcur->allocated_stack / BPF_REG_SIZE); + for (i = 0; i < num_slots; i++) { if (!is_spilled_reg(&fold->stack[i]) || !is_spilled_reg(&fcur->stack[i])) continue; @@ -9014,8 +9112,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_iter = get_iter_from_state(queued_st, meta); queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; @@ -10052,6 +10150,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_INSN_ARRAY: + goto error; default: break; } @@ -10366,8 +10466,6 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); -static bool is_task_work_add_kfunc(u32 func_id); - static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10586,10 +10684,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm) || - is_task_work_add_kfunc(insn->imm)); - if (!async_cb) - return -EFAULT; + is_async_cb_sleepable(env, insn)); + if (IS_ERR(async_cb)) + return PTR_ERR(async_cb); callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; @@ -10605,8 +10702,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins * proceed with next instruction within current frame. */ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); - if (!callback_state) - return -ENOMEM; + if (IS_ERR(callback_state)) + return PTR_ERR(callback_state); err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, callback_state); @@ -10646,7 +10743,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (env->subprog_info[subprog].might_sleep && - (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks || env->cur_state->active_irq_id || !in_sleepable(env))) { verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" @@ -10660,8 +10757,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } - verbose(env, "Func#%d ('%s') is global and assumed valid.\n", - subprog, sub_name); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Func#%d ('%s') is global and assumed valid.\n", + subprog, sub_name); if (env->subprog_info[subprog].changes_pkt_data) clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ @@ -10974,6 +11072,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) bool in_callback_fn; int err; + err = bpf_update_live_stack(env); + if (err) + return err; + callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; if (r0->type == PTR_TO_STACK) { @@ -11224,7 +11326,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit return -EINVAL; } - if (check_lock && env->cur_state->active_rcu_lock) { + if (check_lock && env->cur_state->active_rcu_locks) { verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); return -EINVAL; } @@ -11359,6 +11461,15 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return *ptr && (*ptr)->func ? 0 : -EINVAL; } +/* Check if we're in a sleepable context. */ +static inline bool in_sleepable_context(struct bpf_verifier_env *env) +{ + return !env->cur_state->active_rcu_locks && + !env->cur_state->active_preempt_locks && + !env->cur_state->active_irq_id && + in_sleepable(env); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11419,15 +11530,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (env->cur_state->active_rcu_lock) { + if (env->cur_state->active_rcu_locks) { if (fn->might_sleep) { verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_preempt_locks) { @@ -11436,9 +11544,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_irq_id) { @@ -11447,11 +11552,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + /* Track non-sleepable context for helpers. */ + if (!in_sleepable_context(env)) + env->insn_aux_data[insn_idx].non_sleepable = true; + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -11482,15 +11588,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. - */ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { - verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); - return -EFAULT; - } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { u32 ref_obj_id = meta.ref_obj_id; @@ -11884,6 +11982,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->prog->call_get_func_ip = true; } + if (func_id == BPF_FUNC_tail_call) { + if (env->cur_state->curframe) { + struct bpf_verifier_state *branch; + + mark_reg_scratched(env, BPF_REG_0); + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + clear_all_pkt_pointers(env); + mark_reg_unknown(env, regs, BPF_REG_0); + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + env->insn_idx--; + } else { + changes_data = false; + } + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -12258,9 +12375,11 @@ enum special_kfunc_type { KF_bpf_res_spin_unlock, KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, + KF_bpf_dynptr_from_file, + KF_bpf_dynptr_file_discard, KF___bpf_trap, - KF_bpf_task_work_schedule_signal, - KF_bpf_task_work_schedule_resume, + KF_bpf_task_work_schedule_signal_impl, + KF_bpf_task_work_schedule_resume_impl, }; BTF_ID_LIST(special_kfunc_list) @@ -12330,14 +12449,16 @@ BTF_ID(func, bpf_res_spin_lock) BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, bpf_dynptr_from_file) +BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal) -BTF_ID(func, bpf_task_work_schedule_resume) +BTF_ID(func, bpf_task_work_schedule_signal_impl) +BTF_ID(func, bpf_task_work_schedule_resume_impl) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) @@ -13293,6 +13414,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_XDP; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { dynptr_arg_type |= DYNPTR_TYPE_SKB_META; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; @@ -13827,9 +13953,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *regs; branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); - if (!branch) { + if (IS_ERR(branch)) { verbose(env, "failed to push state for failed lock acquisition\n"); - return -ENOMEM; + return PTR_ERR(branch); } regs = branch->frame[branch->curframe]->regs; @@ -13861,6 +13987,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Track non-sleepable context for kfuncs, same as for helpers. */ + if (!in_sleepable_context(env)) + insn_aux->non_sleepable = true; + /* Check the arguments */ err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) @@ -13907,36 +14037,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, preempt_disable = is_kfunc_bpf_preempt_disable(&meta); preempt_enable = is_kfunc_bpf_preempt_enable(&meta); - if (env->cur_state->active_rcu_lock) { + if (rcu_lock) { + env->cur_state->active_rcu_locks++; + } else if (rcu_unlock) { struct bpf_func_state *state; struct bpf_reg_state *reg; u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { - verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); - return -EACCES; - } - - if (rcu_lock) { - verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); + if (env->cur_state->active_rcu_locks == 0) { + verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; - } else if (rcu_unlock) { + } + if (--env->cur_state->active_rcu_locks == 0) { bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; } })); - env->cur_state->active_rcu_lock = false; - } else if (sleepable) { - verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); - return -EACCES; } - } else if (rcu_lock) { - env->cur_state->active_rcu_lock = true; - } else if (rcu_unlock) { - verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); - return -EINVAL; + } else if (sleepable && env->cur_state->active_rcu_locks) { + verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); + return -EACCES; + } + + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); + return -EACCES; } if (env->cur_state->active_preempt_locks) { @@ -13969,12 +14096,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ if (meta.release_regno) { - err = release_reference(env, regs[meta.release_regno].ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; + struct bpf_reg_state *reg = ®s[meta.release_regno]; + + if (meta.initialized_dynptr.ref_obj_id) { + err = unmark_stack_slots_dynptr(env, reg); + } else { + err = release_reference(env, reg->ref_obj_id); + if (err) + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, meta.func_id); } + if (err) + return err; } if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || @@ -14280,16 +14413,15 @@ struct bpf_sanitize_info { bool mask_to_left; }; -static struct bpf_verifier_state * -sanitize_speculative_path(struct bpf_verifier_env *env, - const struct bpf_insn *insn, - u32 next_idx, u32 curr_idx) +static int sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; branch = push_stack(env, next_idx, curr_idx, true); - if (branch && insn) { + if (!IS_ERR(branch) && insn) { regs = branch->frame[branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_K) { mark_reg_unknown(env, regs, insn->dst_reg); @@ -14298,7 +14430,7 @@ sanitize_speculative_path(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->src_reg); } } - return branch; + return PTR_ERR_OR_ZERO(branch); } static int sanitize_ptr_alu(struct bpf_verifier_env *env, @@ -14317,7 +14449,6 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; struct bpf_reg_state tmp; - bool ret; int err; if (can_skip_alu_sanitation(env, insn)) @@ -14390,11 +14521,12 @@ do_sim: tmp = *dst_reg; copy_register_state(dst_reg, ptr_reg); } - ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, - env->insn_idx); - if (!ptr_is_dst_reg && ret) + err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); + if (err < 0) + return REASON_STACK; + if (!ptr_is_dst_reg) *dst_reg = tmp; - return !ret ? REASON_STACK : 0; + return 0; } static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) @@ -15948,6 +16080,30 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + if (reg1 == reg2) { + switch (opcode) { + case BPF_JGE: + case BPF_JLE: + case BPF_JSGE: + case BPF_JSLE: + case BPF_JEQ: + return 1; + case BPF_JGT: + case BPF_JLT: + case BPF_JSGT: + case BPF_JSLT: + case BPF_JNE: + return 0; + case BPF_JSET: + if (tnum_is_const(t1)) + return t1.value != 0; + else + return (smin1 <= 0 && smax1 >= 0) ? -1 : 1; + default: + return -1; + } + } + switch (opcode) { case BPF_JEQ: /* constants, umin/umax and smin/smax checks would be @@ -16394,6 +16550,13 @@ static int reg_set_min_max(struct bpf_verifier_env *env, if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) return 0; + /* We compute branch direction for same SCALAR_VALUE registers in + * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET) + * on the same registers, we don't need to adjust the min/max values. + */ + if (false_reg1 == false_reg2) + return 0; + /* fallthrough (FALSE) branch */ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); reg_bounds_sync(false_reg1); @@ -16714,8 +16877,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* branch out 'fallthrough' insn as a new state to explore */ queued_st = push_stack(env, idx + 1, idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_st->may_goto_depth++; if (prev_st) @@ -16793,10 +16956,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * the fall-through branch for simulation under speculative * execution. */ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, *insn_idx + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); *insn_idx += insn->off; @@ -16806,11 +16970,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * program will go. If needed, push the goto branch for * simulation under speculative execution. */ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, - *insn_idx + insn->off + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1, + *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); return 0; @@ -16831,10 +16996,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, - false); - if (!other_branch) - return -EFAULT; + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); other_branch_regs = other_branch->frame[other_branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_X) { @@ -17017,7 +17181,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - WARN_ON_ONCE(map->max_entries != 1); + WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && + map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { @@ -17769,6 +17934,247 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env) return 0; } +static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem) +{ + size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]); + struct bpf_iarray *new; + + new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT); + if (!new) { + /* this is what callers always want, so simplify the call site */ + kvfree(old); + return NULL; + } + + new->cnt = n_elem; + return new; +} + +static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items) +{ + struct bpf_insn_array_value *value; + u32 i; + + for (i = start; i <= end; i++) { + value = map->ops->map_lookup_elem(map, &i); + /* + * map_lookup_elem of an array map will never return an error, + * but not checking it makes some static analysers to worry + */ + if (IS_ERR(value)) + return PTR_ERR(value); + else if (!value) + return -EINVAL; + items[i - start] = value->xlated_off; + } + return 0; +} + +static int cmp_ptr_to_u32(const void *a, const void *b) +{ + return *(u32 *)a - *(u32 *)b; +} + +static int sort_insn_array_uniq(u32 *items, int cnt) +{ + int unique = 1; + int i; + + sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL); + + for (i = 1; i < cnt; i++) + if (items[i] != items[unique - 1]) + items[unique++] = items[i]; + + return unique; +} + +/* + * sort_unique({map[start], ..., map[end]}) into off + */ +static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off) +{ + u32 n = end - start + 1; + int err; + + err = copy_insn_array(map, start, end, off); + if (err) + return err; + + return sort_insn_array_uniq(off, n); +} + +/* + * Copy all unique offsets from the map + */ +static struct bpf_iarray *jt_from_map(struct bpf_map *map) +{ + struct bpf_iarray *jt; + int err; + int n; + + jt = iarray_realloc(NULL, map->max_entries); + if (!jt) + return ERR_PTR(-ENOMEM); + + n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items); + if (n < 0) { + err = n; + goto err_free; + } + if (n == 0) { + err = -EINVAL; + goto err_free; + } + jt->cnt = n; + return jt; + +err_free: + kvfree(jt); + return ERR_PTR(err); +} + +/* + * Find and collect all maps which fit in the subprog. Return the result as one + * combined jump table in jt->items (allocated with kvcalloc) + */ +static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env, + int subprog_start, int subprog_end) +{ + struct bpf_iarray *jt = NULL; + struct bpf_map *map; + struct bpf_iarray *jt_cur; + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) { + /* + * TODO (when needed): collect only jump tables, not static keys + * or maps for indirect calls + */ + map = env->insn_array_maps[i]; + + jt_cur = jt_from_map(map); + if (IS_ERR(jt_cur)) { + kvfree(jt); + return jt_cur; + } + + /* + * This is enough to check one element. The full table is + * checked to fit inside the subprog later in create_jt() + */ + if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) { + u32 old_cnt = jt ? jt->cnt : 0; + jt = iarray_realloc(jt, old_cnt + jt_cur->cnt); + if (!jt) { + kvfree(jt_cur); + return ERR_PTR(-ENOMEM); + } + memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2); + } + + kvfree(jt_cur); + } + + if (!jt) { + verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start); + return ERR_PTR(-EINVAL); + } + + jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt); + return jt; +} + +static struct bpf_iarray * +create_jt(int t, struct bpf_verifier_env *env) +{ + static struct bpf_subprog_info *subprog; + int subprog_start, subprog_end; + struct bpf_iarray *jt; + int i; + + subprog = bpf_find_containing_subprog(env, t); + subprog_start = subprog->start; + subprog_end = (subprog + 1)->start; + jt = jt_from_subprog(env, subprog_start, subprog_end); + if (IS_ERR(jt)) + return jt; + + /* Check that the every element of the jump table fits within the given subprogram */ + for (i = 0; i < jt->cnt; i++) { + if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) { + verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n", + t, subprog_start, subprog_end); + kvfree(jt); + return ERR_PTR(-EINVAL); + } + } + + return jt; +} + +/* "conditional jump with N edges" */ +static int visit_gotox_insn(int t, struct bpf_verifier_env *env) +{ + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + bool keep_exploring = false; + struct bpf_iarray *jt; + int i, w; + + jt = env->insn_aux_data[t].jt; + if (!jt) { + jt = create_jt(t, env); + if (IS_ERR(jt)) + return PTR_ERR(jt); + + env->insn_aux_data[t].jt = jt; + } + + mark_prune_point(env, t); + for (i = 0; i < jt->cnt; i++) { + w = jt->items[i]; + if (w < 0 || w >= env->prog->len) { + verbose(env, "indirect jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + mark_jmp_point(env, w); + + /* EXPLORED || DISCOVERED */ + if (insn_state[w]) + continue; + + if (env->cfg.cur_stack >= env->prog->len) + return -E2BIG; + + insn_stack[env->cfg.cur_stack++] = w; + insn_state[w] |= DISCOVERED; + keep_exploring = true; + } + + return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING; +} + +static int visit_tailcall_insn(struct bpf_verifier_env *env, int t) +{ + static struct bpf_subprog_info *subprog; + struct bpf_iarray *jt; + + if (env->insn_aux_data[t].jt) + return 0; + + jt = iarray_realloc(NULL, 2); + if (!jt) + return -ENOMEM; + + subprog = bpf_find_containing_subprog(env, t); + jt->items[0] = t + 1; + jt->items[1] = subprog->exit_idx; + env->insn_aux_data[t].jt = jt; + return 0; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -17829,6 +18235,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_subprog_might_sleep(env, t); if (bpf_helper_changes_pkt_data(insn->imm)) mark_subprog_changes_pkt_data(env, t); + if (insn->imm == BPF_FUNC_tail_call) + visit_tailcall_insn(env, t); } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -17861,8 +18269,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: - if (BPF_SRC(insn->code) != BPF_K) - return -EINVAL; + if (BPF_SRC(insn->code) == BPF_X) + return visit_gotox_insn(t, env); if (BPF_CLASS(insn->code) == BPF_JMP) off = insn->off; @@ -17989,8 +18397,9 @@ err_free: */ static int compute_postorder(struct bpf_verifier_env *env) { - u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2]; + u32 cur_postorder, i, top, stack_sz, s; int *stack = NULL, *postorder = NULL, *state = NULL; + struct bpf_iarray *succ; postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); @@ -18014,11 +18423,11 @@ static int compute_postorder(struct bpf_verifier_env *env) stack_sz--; continue; } - succ_cnt = bpf_insn_successors(env->prog, top, succ); - for (s = 0; s < succ_cnt; ++s) { - if (!state[succ[s]]) { - stack[stack_sz++] = succ[s]; - state[succ[s]] |= DISCOVERED; + succ = bpf_insn_successors(env, top); + for (s = 0; s < succ->cnt; ++s) { + if (!state[succ->items[s]]) { + stack[stack_sz++] = succ->items[s]; + state[succ->items[s]] |= DISCOVERED; } } state[top] |= EXPLORED; @@ -18790,6 +19199,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; case PTR_TO_ARENA: return true; + case PTR_TO_INSN: + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && + rold->off == rcur->off && range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } @@ -18970,7 +19383,7 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (old->active_preempt_locks != cur->active_preempt_locks) return false; - if (old->active_rcu_lock != cur->active_rcu_lock) + if (old->active_rcu_locks != cur->active_rcu_locks) return false; if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) @@ -19142,7 +19555,7 @@ static int propagate_precision(struct bpf_verifier_env *env, bt_set_frame_slot(&env->bt, fr, i); first = false; } - if (!first) + if (!first && (env->log.level & BPF_LOG_LEVEL2)) verbose(env, "\n"); } @@ -19782,9 +20195,6 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; if (env->cur_state->curframe) { - err = bpf_update_live_stack(env); - if (err) - return err; /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); if (err) @@ -19799,6 +20209,99 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; } +static int indirect_jump_min_max_index(struct bpf_verifier_env *env, + int regno, + struct bpf_map *map, + u32 *pmin_index, u32 *pmax_index) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + u64 min_index, max_index; + const u32 size = 8; + + if (check_add_overflow(reg->umin_value, reg->off, &min_index) || + (min_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n", + regno, reg->umin_value, reg->off); + return -ERANGE; + } + if (check_add_overflow(reg->umax_value, reg->off, &max_index) || + (max_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n", + regno, reg->umax_value, reg->off); + return -ERANGE; + } + + min_index /= size; + max_index /= size; + + if (max_index >= map->max_entries) { + verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n", + regno, min_index, max_index, map->max_entries); + return -EINVAL; + } + + *pmin_index = min_index; + *pmax_index = max_index; + return 0; +} + +/* gotox *dst_reg */ +static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *dst_reg; + struct bpf_map *map; + u32 min_index, max_index; + int err = 0; + int n; + int i; + + dst_reg = reg_state(env, insn->dst_reg); + if (dst_reg->type != PTR_TO_INSN) { + verbose(env, "R%d has type %s, expected PTR_TO_INSN\n", + insn->dst_reg, reg_type_str(env, dst_reg->type)); + return -EINVAL; + } + + map = dst_reg->map_ptr; + if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg)) + return -EFAULT; + + if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env, + "R%d has incorrect map type %d", insn->dst_reg, map->map_type)) + return -EFAULT; + + err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index); + if (err) + return err; + + /* Ensure that the buffer is large enough */ + if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) { + env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf, + max_index - min_index + 1); + if (!env->gotox_tmp_buf) + return -ENOMEM; + } + + n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); + if (n < 0) + return n; + if (n == 0) { + verbose(env, "register R%d doesn't point to any offset in map id=%d\n", + insn->dst_reg, map->id); + return -EINVAL; + } + + for (i = 0; i < n - 1; i++) { + other_branch = push_stack(env, env->gotox_tmp_buf->items[i], + env->insn_idx, env->cur_state->speculative); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); + } + env->insn_idx = env->gotox_tmp_buf->items[n-1]; + return 0; +} + static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) { int err; @@ -19901,6 +20404,15 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) mark_reg_scratched(env, BPF_REG_0); } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->src_reg != BPF_REG_0 || + insn->imm != 0 || insn->off != 0) { + verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); + return -EINVAL; + } + return check_indirect_jump(env, insn); + } + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || @@ -20417,6 +20929,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: break; default: verbose(env, @@ -20488,6 +21001,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + err = bpf_insn_array_init(map, env->prog); + if (err) { + verbose(env, "Failed to properly initialize insn array\n"); + return err; + } + env->insn_array_maps[env->insn_array_map_cnt++] = map; + } + return env->used_map_cnt - 1; } @@ -20734,6 +21256,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void release_insn_arrays(struct bpf_verifier_env *env) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_release(env->insn_array_maps[i]); +} + +static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust(env->insn_array_maps[i], off, len); +} + +static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); +} + static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; @@ -20775,6 +21324,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of } adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); + adjust_insn_arrays(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; } @@ -20937,6 +21487,27 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, return 0; } +/* + * Clean up dynamically allocated fields of aux data for instructions [start, ...] + */ +static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int end = start + len; + int i; + + for (i = start; i < end; i++) { + if (aux_data[i].jt) { + kvfree(aux_data[i].jt); + aux_data[i].jt = NULL; + } + + if (bpf_is_ldimm64(&insns[i])) + i++; + } +} + static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; @@ -20946,6 +21517,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); + /* Should be called before bpf_remove_insns, as it uses prog->insnsi */ + clear_insn_aux_data(env, off, cnt); + err = bpf_remove_insns(env->prog, off, cnt); if (err) return err; @@ -20958,6 +21532,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (err) return err; + adjust_insn_arrays_after_remove(env, off, cnt); + memmove(aux_data + off, aux_data + off + cnt, sizeof(*aux_data) * (orig_prog_len - off - cnt)); @@ -21497,6 +22073,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; + int old_len, subprog_start_adjustment = 0; if (env->subprog_cnt <= 1) return 0; @@ -21571,6 +22148,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; + func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment; func[i]->aux->func_info = prog->aux->func_info; func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; @@ -21600,6 +22178,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i]->aux->arena = prog->aux->arena; + func[i]->aux->used_maps = env->used_maps; + func[i]->aux->used_map_cnt = env->used_map_cnt; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { @@ -21624,7 +22204,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; + + /* + * To properly pass the absolute subprog start to jit + * all instruction adjustments should be accumulated + */ + old_len = func[i]->len; func[i] = bpf_int_jit_compile(func[i]); + subprog_start_adjustment += func[i]->len - old_len; + if (!func[i]->jited) { err = -ENOTSUPP; goto out_free; @@ -21677,6 +22265,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) cond_resched(); } + /* + * Cleanup func[i]->aux fields which aren't required + * or can become invalid in future + */ + for (i = 0; i < env->subprog_cnt; i++) { + func[i]->aux->used_maps = NULL; + func[i]->aux->used_map_cnt = 0; + } + /* finally lock prog and jit images for all functions and * populate kallsysm. Begin at the first subprogram, since * bpf_prog_load will add the kallsyms for the main program. @@ -21806,46 +22403,47 @@ static int fixup_call_args(struct bpf_verifier_env *env) } /* replace a generic kfunc with a specialized version if necessary */ -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr) +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) { struct bpf_prog *prog = env->prog; bool seen_direct_write; void *xdp_kfunc; bool is_rdonly; + u32 func_id = desc->func_id; + u16 offset = desc->offset; + unsigned long addr = desc->addr; + + if (offset) /* return if module BTF is used */ + return 0; if (bpf_dev_bound_kfunc_id(func_id)) { xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); - if (xdp_kfunc) { - *addr = (unsigned long)xdp_kfunc; - return; - } + if (xdp_kfunc) + addr = (unsigned long)xdp_kfunc; /* fallback to default kfunc when not supported by netdev */ - } - - if (offset) - return; - - if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { seen_direct_write = env->seen_direct_write; is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); if (is_rdonly) - *addr = (unsigned long)bpf_dynptr_from_skb_rdonly; + addr = (unsigned long)bpf_dynptr_from_skb_rdonly; /* restore env->seen_direct_write to its original value, since * may_access_direct_pkt_data mutates it */ env->seen_direct_write = seen_direct_write; + } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_set_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_remove_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + if (!env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_dynptr_from_file_sleepable; } - - if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && - bpf_lsm_has_d_inode_locked(prog)) - *addr = (unsigned long)bpf_set_dentry_xattr_locked; - - if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] && - bpf_lsm_has_d_inode_locked(prog)) - *addr = (unsigned long)bpf_remove_dentry_xattr_locked; + desc->addr = addr; + return 0; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, @@ -21868,7 +22466,8 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { - const struct bpf_kfunc_desc *desc; + struct bpf_kfunc_desc *desc; + int err; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); @@ -21888,6 +22487,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } + err = specialize_kfunc(env, desc, insn_idx); + if (err) + return err; + if (!bpf_jit_supports_far_kfunc_call()) insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) @@ -22483,8 +23086,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } if (is_storage_get_function(insn->imm)) { - if (!in_sleepable(env) || - env->insn_aux_data[i + delta].storage_get_func_atomic) + if (env->insn_aux_data[i + delta].non_sleepable) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); else insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); @@ -22917,7 +23519,9 @@ next_insn: } } - sort_kfunc_descs_by_imm_off(env->prog); + ret = sort_kfunc_descs_by_imm_off(env); + if (ret) + return ret; return 0; } @@ -23154,6 +23758,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) state->curframe = 0; state->speculative = false; state->branches = 1; + state->in_sleepable = env->prog->sleepable; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { kfree(state); @@ -23173,7 +23778,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) struct bpf_subprog_arg_info *arg; struct bpf_reg_state *reg; - verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); ret = btf_prepare_func_args(env, subprog); if (ret) goto out; @@ -24276,14 +24882,13 @@ static int compute_live_registers(struct bpf_verifier_env *env) for (i = 0; i < env->cfg.cur_postorder; ++i) { int insn_idx = env->cfg.insn_postorder[i]; struct insn_live_regs *live = &state[insn_idx]; - int succ_num; - u32 succ[2]; + struct bpf_iarray *succ; u16 new_out = 0; u16 new_in = 0; - succ_num = bpf_insn_successors(env->prog, insn_idx, succ); - for (int s = 0; s < succ_num; ++s) - new_out |= state[succ[s]].in; + succ = bpf_insn_successors(env, insn_idx); + for (int s = 0; s < succ->cnt; ++s) + new_out |= state[succ->items[s]].in; new_in = (new_out & ~live->def) | live->use; if (new_out != live->out || new_in != live->in) { live->in = new_in; @@ -24336,11 +24941,11 @@ static int compute_scc(struct bpf_verifier_env *env) const u32 insn_cnt = env->prog->len; int stack_sz, dfs_sz, err = 0; u32 *stack, *pre, *low, *dfs; - u32 succ_cnt, i, j, t, w; + u32 i, j, t, w; u32 next_preorder_num; u32 next_scc_id; bool assign_scc; - u32 succ[2]; + struct bpf_iarray *succ; next_preorder_num = 1; next_scc_id = 1; @@ -24447,12 +25052,12 @@ dfs_continue: stack[stack_sz++] = w; } /* Visit 'w' successors */ - succ_cnt = bpf_insn_successors(env->prog, w, succ); - for (j = 0; j < succ_cnt; ++j) { - if (pre[succ[j]]) { - low[w] = min(low[w], low[succ[j]]); + succ = bpf_insn_successors(env, w); + for (j = 0; j < succ->cnt; ++j) { + if (pre[succ->items[j]]) { + low[w] = min(low[w], low[succ->items[j]]); } else { - dfs[dfs_sz++] = succ[j]; + dfs[dfs_sz++] = succ->items[j]; goto dfs_continue; } } @@ -24469,8 +25074,8 @@ dfs_continue: * or if component has a self reference. */ assign_scc = stack[stack_sz - 1] != w; - for (j = 0; j < succ_cnt; ++j) { - if (succ[j] == w) { + for (j = 0; j < succ->cnt; ++j) { + if (succ->items[j] == w) { assign_scc = true; break; } @@ -24532,6 +25137,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 goto err_free_env; for (i = 0; i < len; i++) env->insn_aux_data[i].orig_idx = i; + env->succ = iarray_realloc(NULL, 2); + if (!env->succ) + goto err_free_env; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; @@ -24755,6 +25363,8 @@ skip_full_check: adjust_btf_func(env); err_release_maps: + if (ret) + release_insn_arrays(env); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. @@ -24775,11 +25385,14 @@ err_release_maps: err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); + clear_insn_aux_data(env, 0, env->prog->len); vfree(env->insn_aux_data); err_free_env: bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); + kvfree(env->succ); + kvfree(env->gotox_tmp_buf); kvfree(env); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6ae5f48cf64e..fa08ea288737 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -60,6 +60,7 @@ #include <linux/sched/deadline.h> #include <linux/psi.h> #include <linux/nstree.h> +#include <linux/irq_work.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -250,12 +251,9 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_cgroup_ns), .user_ns = &init_user_ns, - .ns.ops = &cgroupns_operations, - .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, - .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; @@ -290,6 +288,7 @@ static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static void cgroup_rt_init(void); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline @@ -944,7 +943,8 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit()/cgroup_free() dropping the css_set. + * against cgroup_task_dead()/cgroup_task_free() dropping + * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -1522,9 +1522,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void) } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() - * on a task which has already passed exit_task_namespaces() and - * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all - * cgroups visible for lookups. + * on a task which has already passed exit_nsproxy_namespaces() + * and nsproxy == NULL. Fall back to cgrp_dfl_root which will + * make all cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } @@ -5363,7 +5363,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; - const struct cred *saved_cred; ssize_t ret; enum cgroup_attach_lock_mode lock_mode; @@ -5386,11 +5385,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * permissions using the credentials from file open to protect against * inherited fd attacks. */ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - threadgroup, ctx->ns); - revert_creds(saved_cred); + scoped_with_creds(of->file->f_cred) + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); if (ret) goto out_finish; @@ -5892,7 +5890,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; - seqcount_init(&cgrp->freezer.freeze_seq); + seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -6359,6 +6357,7 @@ int __init cgroup_init(void) BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); + cgroup_rt_init(); cgroup_lock(); @@ -6972,19 +6971,29 @@ void cgroup_post_fork(struct task_struct *child, } /** - * cgroup_exit - detach cgroup from exiting task + * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ -void cgroup_exit(struct task_struct *tsk) +void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; - struct css_set *cset; int i; - spin_lock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ + do_each_subsys_mask(ss, i, have_exit_callback) { + ss->exit(tsk); + } while_each_subsys_mask(); +} + +static void do_cgroup_task_dead(struct task_struct *tsk) +{ + struct css_set *cset; + unsigned long flags; + + spin_lock_irqsave(&css_set_lock, flags); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); @@ -7002,15 +7011,61 @@ void cgroup_exit(struct task_struct *tsk) test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, flags); +} - /* see cgroup_post_fork() for details */ - do_each_subsys_mask(ss, i, have_exit_callback) { - ss->exit(tsk); - } while_each_subsys_mask(); +#ifdef CONFIG_PREEMPT_RT +/* + * cgroup_task_dead() is called from finish_task_switch() which doesn't allow + * scheduling even in RT. As the task_dead path requires grabbing css_set_lock, + * this lead to sleeping in the invalid context warning bug. css_set_lock is too + * big to become a raw_spinlock. The task_dead path doesn't need to run + * synchronously but can't be delayed indefinitely either as the dead task pins + * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy + * irq_work to allow batching while ensuring timely completion. + */ +static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks); +static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork); + +static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork) +{ + struct llist_node *lnode; + struct task_struct *task, *next; + + lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks)); + llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) { + do_cgroup_task_dead(task); + put_task_struct(task); + } +} + +static void __init cgroup_rt_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu)); + per_cpu(cgrp_dead_tasks_iwork, cpu) = + IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn); + } +} + +void cgroup_task_dead(struct task_struct *task) +{ + get_task_struct(task); + llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks)); + irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork)); } +#else /* CONFIG_PREEMPT_RT */ +static void __init cgroup_rt_init(void) {} -void cgroup_release(struct task_struct *task) +void cgroup_task_dead(struct task_struct *task) +{ + do_cgroup_task_dead(task); +} +#endif /* CONFIG_PREEMPT_RT */ + +void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; @@ -7018,6 +7073,11 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); +} + +void cgroup_task_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); if (!list_empty(&task->cg_list)) { spin_lock_irq(&css_set_lock); @@ -7025,11 +7085,7 @@ void cgroup_release(struct task_struct *task) list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } -} -void cgroup_free(struct task_struct *task) -{ - struct css_set *cset = task_css_set(task); put_css_set(cset); } diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 337608f408ce..01976c8e7d49 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -155,13 +155,17 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* number of valid local child partitions */ - int nr_subparts; - /* partition root state */ int partition_root_state; /* + * Whether cpuset is a remote partition. + * It used to be a list anchoring all remote partitions — we can switch back + * to a list if we need to iterate over the remote partitions. + */ + bool remote_partition; + + /* * number of SCHED_DEADLINE tasks attached to this cpuset, so that we * know when to rebuild associated root domain bandwidth information. */ @@ -175,9 +179,6 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; - /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; }; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 52468d2c178a..6e6eb09b8db6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -82,14 +82,18 @@ static cpumask_var_t subpartitions_cpus; static cpumask_var_t isolated_cpus; /* + * isolated_cpus updating flag (protected by cpuset_mutex) + * Set if isolated_cpus is going to be updated in the current + * cpuset_mutex crtical section. + */ +static bool isolated_cpus_updating; + +/* * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot */ static cpumask_var_t boot_hk_cpus; static bool have_boot_isolcpus; -/* List of remote partition root children */ -static struct list_head remote_children; - /* * A flag to force sched domain rebuild at the end of an operation. * It can be set in @@ -212,7 +216,7 @@ static struct cpuset top_cpuset = { BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, - .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), + .remote_partition = false, }; /* @@ -352,33 +356,55 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked * @excluded_child: a child cpuset to be excluded in task checking * Return: true if there are tasks, false otherwise * - * It is assumed that @cs is a valid partition root. @excluded_child should - * be non-NULL when this cpuset is going to become a partition itself. + * @cs should be a valid partition root or going to become a partition root. + * @excluded_child should be non-NULL when this cpuset is going to become a + * partition itself. + * + * Note that a remote partition is not allowed underneath a valid local + * or remote partition. So if a non-partition root child is populated, + * the whole partition is considered populated. */ static inline bool partition_is_populated(struct cpuset *cs, struct cpuset *excluded_child) { - struct cgroup_subsys_state *css; - struct cpuset *child; + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; - if (cs->css.cgroup->nr_populated_csets) + /* + * We cannot call cs_is_populated(cs) directly, as + * nr_populated_domain_children may include populated + * csets from descendants that are partitions. + */ + if (cs->css.cgroup->nr_populated_csets || + cs->attach_in_progress) return true; - if (!excluded_child && !cs->nr_subparts) - return cgroup_is_populated(cs->css.cgroup); rcu_read_lock(); - cpuset_for_each_child(child, css, cs) { - if (child == excluded_child) + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + if (cp == cs || cp == excluded_child) continue; - if (is_partition_valid(child)) + + if (is_partition_valid(cp)) { + pos_css = css_rightmost_descendant(pos_css); continue; - if (cgroup_is_populated(child->css.cgroup)) { + } + + if (cpuset_is_populated(cp)) { rcu_read_unlock(); return true; } @@ -663,7 +689,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { + if (cpuset_is_populated(cur)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -1302,7 +1328,6 @@ static void reset_partition_data(struct cpuset *cs) lockdep_assert_held(&callback_lock); - cs->nr_subparts = 0; if (cpumask_empty(cs->exclusive_cpus)) { cpumask_clear(cs->effective_xcpus); if (is_cpu_exclusive(cs)) @@ -1325,6 +1350,8 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus cpumask_or(isolated_cpus, isolated_cpus, xcpus); else cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); + + isolated_cpus_updating = true; } /* @@ -1332,15 +1359,12 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added - * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static bool partition_xcpus_add(int new_prs, struct cpuset *parent, +static void partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { - bool isolcpus_updated; - WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1350,13 +1374,11 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); - isolcpus_updated = (new_prs != parent->partition_root_state); - if (isolcpus_updated) + if (new_prs != parent->partition_root_state) isolated_cpus_update(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); - return isolcpus_updated; } /* @@ -1364,15 +1386,12 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed - * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static bool partition_xcpus_del(int old_prs, struct cpuset *parent, +static void partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { - bool isolcpus_updated; - WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1381,27 +1400,95 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); - isolcpus_updated = (old_prs != parent->partition_root_state); - if (isolcpus_updated) + if (old_prs != parent->partition_root_state) isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); - return isolcpus_updated; } -static void update_unbound_workqueue_cpumask(bool isolcpus_updated) +/* + * isolated_cpus_can_update - check for isolated & nohz_full conflicts + * @add_cpus: cpu mask for cpus that are going to be isolated + * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL + * Return: false if there is conflict, true otherwise + * + * If nohz_full is enabled and we have isolated CPUs, their combination must + * still leave housekeeping CPUs. + * + * TBD: Should consider merging this function into + * prstate_housekeeping_conflict(). + */ +static bool isolated_cpus_can_update(struct cpumask *add_cpus, + struct cpumask *del_cpus) { - int ret; + cpumask_var_t full_hk_cpus; + int res = true; - lockdep_assert_cpus_held(); + if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE)) + return true; + + if (del_cpus && cpumask_weight_and(del_cpus, + housekeeping_cpumask(HK_TYPE_KERNEL_NOISE))) + return true; + + if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL)) + return false; + + cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE), + housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus); + cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask); + if (!cpumask_weight_andnot(full_hk_cpus, add_cpus)) + res = false; + + free_cpumask_var(full_hk_cpus); + return res; +} + +/* + * prstate_housekeeping_conflict - check for partition & housekeeping conflicts + * @prstate: partition root state to be checked + * @new_cpus: cpu mask + * Return: true if there is conflict, false otherwise + * + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. + */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + if (!have_boot_isolcpus) + return false; + + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) + return true; + + return false; +} + +/* + * update_isolation_cpumasks - Update external isolation related CPU masks + * + * The following external CPU masks will be updated if necessary: + * - workqueue unbound cpumask + */ +static void update_isolation_cpumasks(void) +{ + int ret; - if (!isolcpus_updated) + if (!isolated_cpus_updating) return; + lockdep_assert_cpus_held(); + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); + + ret = tmigr_isolated_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); + + isolated_cpus_updating = false; } /** @@ -1505,7 +1592,7 @@ static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) static inline bool is_remote_partition(struct cpuset *cs) { - return !list_empty(&cs->remote_sibling); + return cs->remote_partition; } static inline bool is_local_partition(struct cpuset *cs) @@ -1526,8 +1613,6 @@ static inline bool is_local_partition(struct cpuset *cs) static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { - bool isolcpus_updated; - /* * The user must have sysadmin privilege. */ @@ -1549,13 +1634,17 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return PERR_INVCPUS; + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->new_cpus, NULL)) || + prstate_housekeeping_conflict(new_prs, tmp->new_cpus)) + return PERR_HKEEPING; spin_lock_irq(&callback_lock); - isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); - list_add(&cs->remote_sibling, &remote_children); + partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + cs->remote_partition = true; cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1578,15 +1667,12 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { - bool isolcpus_updated; - WARN_ON_ONCE(!is_remote_partition(cs)); WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); - list_del_init(&cs->remote_sibling); - isolcpus_updated = partition_xcpus_del(cs->partition_root_state, - NULL, cs->effective_xcpus); + cs->remote_partition = false; + partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); if (cs->prs_err) cs->partition_root_state = -cs->partition_root_state; else @@ -1596,7 +1682,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); cpuset_force_rebuild(); /* @@ -1621,7 +1707,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, { bool adding, deleting; int prs = cs->partition_root_state; - int isolcpus_updated = 0; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1648,15 +1733,18 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) cs->prs_err = PERR_NOCPUS; + else if ((prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + cs->prs_err = PERR_HKEEPING; if (cs->prs_err) goto invalidate; } spin_lock_irq(&callback_lock); if (adding) - isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); + partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) - isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + partition_xcpus_del(prs, NULL, tmp->delmask); /* * Need to update effective_xcpus and exclusive_cpus now as * update_sibling_cpumasks() below may iterate back to the same cs. @@ -1665,7 +1753,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); if (adding || deleting) cpuset_force_rebuild(); @@ -1680,26 +1768,6 @@ invalidate: remote_partition_disable(cs, tmp); } -/* - * prstate_housekeeping_conflict - check for partition & housekeeping conflicts - * @prstate: partition root state to be checked - * @new_cpus: cpu mask - * Return: true if there is conflict, false otherwise - * - * CPUs outside of boot_hk_cpus, if defined, can only be used in an - * isolated partition. - */ -static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) -{ - if (!have_boot_isolcpus) - return false; - - if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) - return true; - - return false; -} - /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1746,9 +1814,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ - int subparts_delta = 0; - int isolcpus_updated = 0; struct cpumask *xcpus = user_xcpus(cs); + int parent_prs = parent->partition_root_state; bool nocpu; lockdep_assert_held(&cpuset_mutex); @@ -1771,10 +1838,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - if (old_prs > 0) { + if (old_prs > 0) new_prs = -old_prs; - subparts_delta--; - } + goto write_error; } @@ -1813,6 +1879,10 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; + if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) && + !isolated_cpus_can_update(xcpus, NULL)) + return PERR_HKEEPING; + if (tasks_nocpu_error(parent, cs, xcpus)) return PERR_NOCPUS; @@ -1829,7 +1899,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); deleting = true; - subparts_delta++; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus @@ -1840,7 +1909,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(cs)) { cpumask_copy(tmp->addmask, cs->effective_xcpus); adding = true; - subparts_delta--; } new_prs = PRS_MEMBER; } else if (newmask) { @@ -1868,6 +1936,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * * For invalid partition: * delmask = newmask & parent->effective_xcpus + * The partition may become valid soon. */ if (is_partition_invalid(cs)) { adding = false; @@ -1882,6 +1951,23 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->effective_xcpus); } + + /* + * TBD: Invalidate a currently valid child root partition may + * still break isolated_cpus_can_update() rule if parent is an + * isolated partition. + */ + if (is_partition_valid(cs) && (old_prs != parent_prs)) { + if ((parent_prs == PRS_ROOT) && + /* Adding to parent means removing isolated CPUs */ + !isolated_cpus_can_update(tmp->delmask, tmp->addmask)) + part_error = PERR_HKEEPING; + if ((parent_prs == PRS_ISOLATED) && + /* Adding to parent means adding isolated CPUs */ + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + part_error = PERR_HKEEPING; + } + /* * The new CPUs to be removed from parent's effective CPUs * must be present. @@ -1963,17 +2049,13 @@ write_error: switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: - if (part_error) { + if (part_error) new_prs = -old_prs; - subparts_delta--; - } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: - if (!part_error) { + if (!part_error) new_prs = -old_prs; - subparts_delta++; - } break; } } @@ -2002,28 +2084,20 @@ write_error: * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (old_prs != new_prs) { + if (old_prs != new_prs) cs->partition_root_state = new_prs; - if (new_prs <= 0) - cs->nr_subparts = 0; - } + /* * Adding to parent's effective_cpus means deletion CPUs from cs * and vice versa. */ if (adding) - isolcpus_updated += partition_xcpus_del(old_prs, parent, - tmp->addmask); + partition_xcpus_del(old_prs, parent, tmp->addmask); if (deleting) - isolcpus_updated += partition_xcpus_add(new_prs, parent, - tmp->delmask); + partition_xcpus_add(new_prs, parent, tmp->delmask); - if (is_partition_valid(parent)) { - parent->nr_subparts += subparts_delta; - WARN_ON_ONCE(parent->nr_subparts < 0); - } spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -2105,8 +2179,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, */ spin_lock_irq(&callback_lock); make_partition_invalid(child); - cs->nr_subparts--; - child->nr_subparts = 0; spin_unlock_irq(&callback_lock); notify_partition_change(child, old_prs); continue; @@ -2135,7 +2207,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, { struct cpuset *cp; struct cgroup_subsys_state *pos_css; - bool need_rebuild_sched_domains = false; int old_prs, new_prs; rcu_read_lock(); @@ -2299,15 +2370,12 @@ get_css: if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cpuset_v2() || is_partition_valid(cp))) - need_rebuild_sched_domains = true; + cpuset_force_rebuild(); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); - - if (need_rebuild_sched_domains) - cpuset_force_rebuild(); } /** @@ -2845,21 +2913,19 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, */ retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) - goto done; + return retval; if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } + top_cpuset.mems_allowed)) + return -EINVAL; + + /* No change? nothing to do */ + if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) + return 0; - if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } retval = validate_change(cs, trialcs); if (retval < 0) - goto done; + return retval; check_insane_mems_config(&trialcs->mems_allowed); @@ -2869,8 +2935,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); -done: - return retval; + return 0; } bool current_cpuset_is_being_rebound(void) @@ -3008,7 +3073,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) * A change in load balance state only, no change in cpumasks. * Need to update isolated_cpus. */ - isolcpus_updated = true; + if (((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(cs->effective_xcpus, NULL)) || + prstate_housekeeping_conflict(new_prs, cs->effective_xcpus)) + err = PERR_HKEEPING; + else + isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it @@ -3043,7 +3113,7 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); @@ -3549,7 +3619,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; - INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) @@ -3820,7 +3889,6 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4021,7 +4089,6 @@ static void cpuset_handle_hotplug(void) */ if (!cpumask_empty(subpartitions_cpus)) { if (cpumask_subset(&new_cpus, subpartitions_cpus)) { - top_cpuset.nr_subparts = 0; cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, @@ -4116,24 +4183,13 @@ void __init cpuset_init_smp(void) BUG_ON(!cpuset_migrate_mm_wq); } -/** - * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. - * - * Description: Returns the cpumask_var_t cpus_allowed of the cpuset - * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_active_mask, even if this means going outside the - * tasks cpuset, except when the task is in the top cpuset. - **/ - -void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) +/* + * Return cpus_allowed mask from a task's cpuset. + */ +static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { - unsigned long flags; struct cpuset *cs; - spin_lock_irqsave(&callback_lock, flags); - cs = task_cs(tsk); if (cs != &top_cpuset) guarantee_active_cpus(tsk, pmask); @@ -4153,7 +4209,39 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) if (!cpumask_intersects(pmask, cpu_active_mask)) cpumask_copy(pmask, possible_mask); } +} +/** + * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. + * + * Similir to cpuset_cpus_allowed() except that the caller must have acquired + * cpuset_mutex. + */ +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +{ + lockdep_assert_held(&cpuset_mutex); + __cpuset_cpus_allowed_locked(tsk, pmask); +} + +/** + * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. + * + * Description: Returns the cpumask_var_t cpus_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of cpu_active_mask, even if this means going outside the + * tasks cpuset, except when the task is in the top cpuset. + **/ + +void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) +{ + unsigned long flags; + + spin_lock_irqsave(&callback_lock, flags); + __cpuset_cpus_allowed_locked(tsk, pmask); spin_unlock_irqrestore(&callback_lock, flags); } @@ -4180,7 +4268,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { - do_set_cpus_allowed(tsk, cs_mask); + set_cpus_allowed_force(tsk, cs_mask); changed = true; } rcu_read_unlock(); diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index dd9417425d92..915b02f65980 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -63,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer) return css_freezer(freezer->css.parent); } -bool cgroup_freezing(struct task_struct *task) +bool cgroup1_freezing(struct task_struct *task) { bool ret; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fdbe57578e68..db9617556dd7 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -30,7 +30,6 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); - ns_tree_add(new_ns); return no_free_ptr(new_ns); } @@ -86,6 +85,7 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags, new_ns->ucounts = ucounts; new_ns->root_cset = cset; + ns_tree_add(new_ns); return new_ns; } diff --git a/kernel/cpu.c b/kernel/cpu.c index db9f6c539b28..b674fdf96208 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3085,10 +3085,13 @@ EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; +unsigned int __num_possible_cpus __ro_after_init = NR_CPUS; #else struct cpumask __cpu_possible_mask __ro_after_init; +unsigned int __num_possible_cpus __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); +EXPORT_SYMBOL(__num_possible_cpus); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); @@ -3116,6 +3119,7 @@ void init_cpu_present(const struct cpumask *src) void init_cpu_possible(const struct cpumask *src) { cpumask_copy(&__cpu_possible_mask, src); + __num_possible_cpus = cpumask_weight(&__cpu_possible_mask); } void set_cpu_online(unsigned int cpu, bool online) @@ -3140,6 +3144,21 @@ void set_cpu_online(unsigned int cpu, bool online) } /* + * This should be marked __init, but there is a boatload of call sites + * which need to be fixed up to do so. Sigh... + */ +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus++; + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus--; + } +} + +/* * Activate the first processor. */ void __init boot_cpu_init(void) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 3b1c43382eec..99dac1aa972a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -373,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res, old_res->start = 0; old_res->end = 0; } else { - crashk_res.end = ram_res->start - 1; + old_res->end = ram_res->start - 1; } crash_free_reserved_phys_range(ram_res->start, ram_res->end); diff --git a/kernel/cred.c b/kernel/cred.c index dbf6b687dc5c..a6f686b30da1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,33 +35,6 @@ do { \ static struct kmem_cache *cred_jar; -/* init to 2 - one for init_task, one to ensure it is never freed */ -static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; - -/* - * The initial credentials for the initial task - */ -struct cred init_cred = { - .usage = ATOMIC_INIT(4), - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - .suid = GLOBAL_ROOT_UID, - .sgid = GLOBAL_ROOT_GID, - .euid = GLOBAL_ROOT_UID, - .egid = GLOBAL_ROOT_GID, - .fsuid = GLOBAL_ROOT_UID, - .fsgid = GLOBAL_ROOT_GID, - .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_EMPTY_SET, - .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_FULL_SET, - .cap_bset = CAP_FULL_SET, - .user = INIT_USER, - .user_ns = &init_user_ns, - .group_info = &init_groups, - .ucounts = &init_ucounts, -}; - /* * The RCU callback to actually dispose of a set of credentials */ @@ -306,6 +279,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags) kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); return 0; } @@ -343,6 +317,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); + return 0; error_put: @@ -435,10 +411,13 @@ int commit_creds(struct cred *new) */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); + if (new->user_ns != old->user_ns) + switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index b12b9db75c1d..61c1690058ed 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -589,24 +589,41 @@ static void kdb_msg_write(const char *msg, int msg_len) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if (!(console_srcu_read_flags(c) & CON_ENABLED)) + short flags = console_srcu_read_flags(c); + + if (!console_is_usable(c, flags, true)) continue; if (c == dbg_io_ops->cons) continue; - if (!c->write) - continue; - /* - * Set oops_in_progress to encourage the console drivers to - * disregard their internal spin locks: in the current calling - * context the risk of deadlock is a bigger problem than risks - * due to re-entering the console driver. We operate directly on - * oops_in_progress rather than using bust_spinlocks() because - * the calls bust_spinlocks() makes on exit are not appropriate - * for this calling context. - */ - ++oops_in_progress; - c->write(c, msg, msg_len); - --oops_in_progress; + + if (flags & CON_NBCON) { + struct nbcon_write_context wctxt = { }; + + /* + * Do not continue if the console is NBCON and the context + * can't be acquired. + */ + if (!nbcon_kdb_try_acquire(c, &wctxt)) + continue; + + nbcon_write_context_set_buf(&wctxt, (char *)msg, msg_len); + + c->write_atomic(c, &wctxt); + nbcon_kdb_release(&wctxt); + } else { + /* + * Set oops_in_progress to encourage the console drivers to + * disregard their internal spin locks: in the current calling + * context the risk of deadlock is a bigger problem than risks + * due to re-entering the console driver. We operate directly on + * oops_in_progress rather than using bust_spinlocks() because + * the calls bust_spinlocks() makes on exit are not appropriate + * for this calling context. + */ + ++oops_in_progress; + c->write(c, msg, msg_len); + --oops_in_progress; + } touch_nmi_watchdog(); } console_srcu_read_unlock(cookie); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 1e5c64cb6a42..138ede653de4 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -23,6 +23,7 @@ #include <linux/ctype.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/swiotlb.h> #include <asm/sections.h> #include "debug.h" @@ -594,7 +595,9 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) if (rc == -ENOMEM) { pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; - } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { + } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && + is_swiotlb_active(entry->dev))) { err_printk(entry->dev, entry, "cacheline tracking EEXIST, overlapping mappings aren't supported\n"); } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1f9ee9759426..f973e7e73c90 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -481,6 +481,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, case PCI_P2PDMA_MAP_BUS_ADDR: sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, sg_phys(sg)); + sg_dma_len(sg) = sg->length; sg_dma_mark_bus_address(sg); continue; default: diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f62e1d1b2063..5c792b30c58a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,19 +11,20 @@ /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - * @regs: Pointer to pt_regs on entry stack - * @ti_work: TIF work flags as read by the caller - */ -__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) +#else +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) +#endif + +static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work * items have been completed. */ - while (ti_work & EXIT_TO_USER_MODE_WORK) { + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { local_irq_enable_exit_to_user(ti_work); @@ -62,17 +63,21 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller + */ +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { - enter_from_user_mode(regs); -} + for (;;) { + ti_work = __exit_to_user_mode_loop(regs, ti_work); -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - exit_to_user_mode_prepare(regs); - instrumentation_end(); - exit_to_user_mode(); + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) + return ti_work; + ti_work = read_thread_flags(); + } } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 66e6ba7fa80c..940a597ded40 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? : syscall; } -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - /* * If SYSCALL_EMU is set, then the only reason to report is when * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 808c0d7a31fa..b9c7e00725d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; @@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, regs = task_pt_regs(current); } + if (defer_cookie) { + /* + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED + * which can be stitched to this one, and add + * the cookie after it (it will be cut off when the + * user stack is copied to the callchain). + */ + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); + perf_callchain_store_context(&ctx, defer_cookie); + goto exit_put; + } + if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); diff --git a/kernel/events/core.c b/kernel/events/core.c index 7541f6f85fcb..ece716879cbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -56,6 +56,7 @@ #include <linux/buildid.h> #include <linux/task_work.h> #include <linux/percpu-rwsem.h> +#include <linux/unwind_deferred.h> #include "internal.h" @@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; +static struct unwind_work perf_unwind_work; + struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { @@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && + event->attr.defer_callchain; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + u64 defer_cookie; if (!current->mm) user = false; @@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, kernel, user, - max_stack, crosstask, true); + if (!(user && defer_user && !crosstask && + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) + defer_cookie = 0; + + callchain = get_perf_callchain(regs, kernel, user, max_stack, + crosstask, true, defer_cookie); + return callchain ?: &__empty_callchain; } @@ -9403,7 +9414,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_HUGETLB; if (file) { - struct inode *inode; + const struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); @@ -9416,12 +9427,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ - name = file_path(file, buf, PATH_MAX - sizeof(u64)); + name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } - inode = file_inode(vma->vm_file); + inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; @@ -9492,7 +9503,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, if (!filter->path.dentry) return false; - if (d_inode(filter->path.dentry) != file_inode(file)) + if (d_inode(filter->path.dentry) != file_user_inode(file)) return false; if (filter->offset > offset + size) @@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_callchain_deferred_event { + struct unwind_stacktrace *trace; + struct { + struct perf_event_header header; + u64 cookie; + u64 nr; + u64 ips[]; + } event; +}; + +static void perf_callchain_deferred_output(struct perf_event *event, void *data) +{ + struct perf_callchain_deferred_event *deferred_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret, size = deferred_event->event.header.size; + + if (!event->attr.defer_output) + return; + + /* XXX do we really need sample_id_all for this ??? */ + perf_event_header__init_id(&deferred_event->event.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + deferred_event->event.header.size); + if (ret) + goto out; + + perf_output_put(&handle, deferred_event->event); + for (int i = 0; i < deferred_event->trace->nr; i++) { + u64 entry = deferred_event->trace->entries[i]; + perf_output_put(&handle, entry); + } + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + deferred_event->event.header.size = size; +} + +static void perf_unwind_deferred_callback(struct unwind_work *work, + struct unwind_stacktrace *trace, u64 cookie) +{ + struct perf_callchain_deferred_event deferred_event = { + .trace = trace, + .event = { + .header = { + .type = PERF_RECORD_CALLCHAIN_DEFERRED, + .misc = PERF_RECORD_MISC_USER, + .size = sizeof(deferred_event.event) + + (trace->nr * sizeof(u64)), + }, + .cookie = cookie, + .nr = trace->nr, + }, + }; + + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); +} + struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; @@ -11773,7 +11844,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event = container_of(hrtimer, struct perf_event, hw.hrtimer); - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE || + event->hw.state & PERF_HES_STOPPED) return HRTIMER_NORESTART; event->pmu->read(event); @@ -11819,15 +11891,20 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; /* - * The throttle can be triggered in the hrtimer handler. - * The HRTIMER_NORESTART should be used to stop the timer, - * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + * Careful: this function can be triggered in the hrtimer handler, + * for cpu-clock events, so hrtimer_cancel() would cause a + * deadlock. + * + * So use hrtimer_try_to_cancel() to try to stop the hrtimer, + * and the cpu-clock handler also sets the PERF_HES_STOPPED flag, + * which guarantees that perf_swevent_hrtimer() will stop the + * hrtimer once it sees the PERF_HES_STOPPED flag. */ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); - hrtimer_cancel(&hwc->hrtimer); + hrtimer_try_to_cancel(&hwc->hrtimer); } } @@ -11871,12 +11948,14 @@ static void cpu_clock_event_update(struct perf_event *event) static void cpu_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) cpu_clock_event_update(event); @@ -11893,7 +11972,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) static void cpu_clock_event_del(struct perf_event *event, int flags) { - cpu_clock_event_stop(event, flags); + cpu_clock_event_stop(event, PERF_EF_UPDATE); } static void cpu_clock_event_read(struct perf_event *event) @@ -11950,12 +12029,14 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) task_clock_event_update(event, event->ctx->time); @@ -14799,6 +14880,9 @@ void __init perf_event_init(void) idr_init(&pmu_idr); + unwind_deferred_init(&perf_unwind_work, + perf_unwind_deferred_callback); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8709c69118b5..f11ceb8be8c4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2765,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + /* * If user decided to take execution elsewhere, it makes little sense * to execute the original instruction, so let's skip it. @@ -2772,9 +2775,6 @@ static void handle_swbp(struct pt_regs *regs) if (instruction_pointer(regs) != bp_vaddr) goto out; - /* Try to optimize after first hit. */ - arch_uprobe_optimize(&uprobe->arch, bp_vaddr); - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..4dc1918db67b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -257,7 +257,7 @@ repeat: rcu_read_unlock(); pidfs_exit(p); - cgroup_release(p); + cgroup_task_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ thread_pid = task_pid(p); @@ -291,6 +291,7 @@ repeat: write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); + exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); @@ -910,6 +911,7 @@ void __noreturn do_exit(long code) user_events_exit(tsk); io_uring_files_cancel(); + sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); @@ -939,7 +941,6 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); - unwind_deferred_task_exit(tsk); trace_sched_process_exit(tsk, group_dead); /* @@ -950,6 +951,12 @@ void __noreturn do_exit(long code) * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); + /* + * PF_EXITING (above) ensures unwind_deferred_request() will no + * longer add new unwinds. While exit_mm() (below) will destroy the + * abaility to do unwinds. So flush any pending unwinds here. + */ + unwind_deferred_task_exit(tsk); exit_mm(); @@ -962,12 +969,12 @@ void __noreturn do_exit(long code) exit_fs(tsk); if (group_dead) disassociate_ctty(1); - exit_task_namespaces(tsk); + exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); sched_autogroup_exit_task(tsk); - cgroup_exit(tsk); + cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..25d243718048 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -736,9 +736,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); unwind_task_free(tsk); - sched_ext_free(tsk); io_uring_free(tsk); - cgroup_free(tsk); + cgroup_task_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); @@ -955,10 +954,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif #ifdef CONFIG_SCHED_MM_CID - tsk->mm_cid = -1; - tsk->last_mm_cid = -1; - tsk->mm_cid_active = 0; - tsk->migrate_from_cpu = -1; + tsk->mm_cid.cid = MM_CID_UNSET; + tsk->mm_cid.active = 0; #endif return tsk; @@ -2453,9 +2450,10 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - exit_task_namespaces(p); + exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { + sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } @@ -2487,6 +2485,7 @@ bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); diff --git a/kernel/freezer.c b/kernel/freezer.c index ddc11a8bd2ea..a76bf957fb32 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,7 +44,7 @@ bool freezing_slow_path(struct task_struct *p) if (tsk_is_oom_victim(p)) return false; - if (pm_nosig_freezing || cgroup_freezing(p)) + if (pm_nosig_freezing || cgroup1_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 125804fbb5cb..cf7e610eac42 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -581,7 +581,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; - if (futex_get_value(&node, naddr)) + if (get_user_inline(node, naddr)) return -EFAULT; if ((node != FUTEX_NO_NODE) && @@ -601,7 +601,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, node = numa_node_id(); node_updated = true; } - if (node_updated && futex_put_value(node, naddr)) + if (node_updated && put_user_inline(node, naddr)) return -EFAULT; } @@ -1680,10 +1680,10 @@ static bool futex_ref_get(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; - guard(rcu)(); + guard(preempt)(); - if (smp_load_acquire(&fph->state) == FR_PERCPU) { - this_cpu_inc(*mm->futex_ref); + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_inc(*mm->futex_ref); return true; } @@ -1694,10 +1694,10 @@ static bool futex_ref_put(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; - guard(rcu)(); + guard(preempt)(); - if (smp_load_acquire(&fph->state) == FR_PERCPU) { - this_cpu_dec(*mm->futex_ref); + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_dec(*mm->futex_ref); return false; } diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 2cd57096c38e..30c2afa03889 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -281,63 +281,11 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 return ret; } -/* - * This does a plain atomic user space read, and the user pointer has - * already been verified earlier by get_futex_key() to be both aligned - * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). - * - * We still want to avoid any speculation, and while __get_user() is - * the traditional model for this, it's actually slower than doing - * this manually these days. - * - * We could just have a per-architecture special function for it, - * the same way we do futex_atomic_cmpxchg_inatomic(), but rather - * than force everybody to do that, write it out long-hand using - * the low-level user-access infrastructure. - * - * This looks a bit overkill, but generally just results in a couple - * of instructions. - */ -static __always_inline int futex_get_value(u32 *dest, u32 __user *from) -{ - u32 val; - - if (can_do_masked_user_access()) - from = masked_user_access_begin(from); - else if (!user_read_access_begin(from, sizeof(*from))) - return -EFAULT; - unsafe_get_user(val, from, Efault); - user_read_access_end(); - *dest = val; - return 0; -Efault: - user_read_access_end(); - return -EFAULT; -} - -static __always_inline int futex_put_value(u32 val, u32 __user *to) -{ - if (can_do_masked_user_access()) - to = masked_user_access_begin(to); - else if (!user_write_access_begin(to, sizeof(*to))) - return -EFAULT; - unsafe_put_user(val, to, Efault); - user_write_access_end(); - return 0; -Efault: - user_write_access_end(); - return -EFAULT; -} - +/* Read from user memory with pagefaults disabled */ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { - int ret; - - pagefault_disable(); - ret = futex_get_value(dest, from); - pagefault_enable(); - - return ret; + guard(pagefault)(); + return get_user_inline(*dest, from); } extern void __futex_unqueue(struct futex_q *q); diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index a08cc076f332..ffde93d051a4 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/mm.h> #include "gcov.h" -#if (__GNUC__ >= 14) +#if (__GNUC__ >= 15) +#define GCOV_COUNTERS 10 +#elif (__GNUC__ >= 14) #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ffa0d80ddd1..678f094d261a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -897,8 +897,9 @@ void handle_percpu_irq(struct irq_desc *desc) void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; unsigned int irq = irq_desc_get_irq(desc); + unsigned int cpu = smp_processor_id(); + struct irqaction *action; irqreturn_t res; /* @@ -910,12 +911,15 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); + for (action = desc->action; action; action = action->next) + if (cpumask_test_cpu(cpu, action->affinity)) + break; + if (likely(action)) { trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); } else { - unsigned int cpu = smp_processor_id(); bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); if (enabled) @@ -929,31 +933,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -/** - * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu - * dev ids - * @desc: the interrupt description structure for this irq - * - * Similar to handle_fasteoi_nmi, but handling the dev_id cookie - * as a percpu pointer. - */ -void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - unsigned int irq = irq_desc_get_irq(desc); - irqreturn_t res; - - __kstat_incr_irqs_this_cpu(desc); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); - trace_irq_handler_exit(irq, action, res); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); -} - static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) @@ -1030,7 +1009,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - scoped_irqdesc_get_and_lock(irq, 0) + scoped_irqdesc_get_and_buslock(irq, 0) __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name); } EXPORT_SYMBOL_GPL(__irq_set_handler); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e103451243a0..786f5570a640 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,7 +133,15 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) */ atomic_inc(&desc->threads_active); - wake_up_process(action->thread); + /* + * This might be a premature wakeup before the thread reached the + * thread function and set the IRQTF_READY bit. It's waiting in + * kthread code with state UNINTERRUPTIBLE. Once it reaches the + * thread function it waits with INTERRUPTIBLE. The wakeup is not + * lost in that case because the thread is guaranteed to observe + * the RUN flag before it goes to sleep in wait_for_interrupt(). + */ + wake_up_state(action->thread, TASK_INTERRUPTIBLE); } static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index db714d3014b5..6acf268f005b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -879,8 +879,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } -int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity) +int irq_set_percpu_devid(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -892,31 +891,10 @@ int irq_set_percpu_devid_partition(unsigned int irq, if (!desc->percpu_enabled) return -ENOMEM; - desc->percpu_affinity = affinity ? : cpu_possible_mask; - irq_set_percpu_devid_flags(irq); return 0; } -int irq_set_percpu_devid(unsigned int irq) -{ - return irq_set_percpu_devid_partition(irq, NULL); -} - -int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !desc->percpu_enabled) - return -EINVAL; - - if (affinity) - cpumask_copy(affinity, desc->percpu_affinity); - - return 0; -} -EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); - void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index dc473faadcc8..2652c4cfd877 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -867,13 +867,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, } EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); -unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec) { struct irq_domain *domain; - struct irq_data *irq_data; - irq_hw_number_t hwirq; - unsigned int type = IRQ_TYPE_NONE; - int virq; if (fwspec->fwnode) { domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); @@ -883,6 +879,32 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) domain = irq_default_domain; } + return domain; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + struct irq_domain *domain = fwspec_to_domain(fwspec); + + memset(info, 0, sizeof(*info)); + + if (!domain || !domain->ops->get_fwspec_info) + return 0; + + return domain->ops->get_fwspec_info(fwspec, info); +} +#endif + +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +{ + unsigned int type = IRQ_TYPE_NONE; + struct irq_domain *domain; + struct irq_data *irq_data; + irq_hw_number_t hwirq; + int virq; + + domain = fwspec_to_domain(fwspec); if (!domain) { pr_warn("no irq domain found for %s !\n", of_node_full_name(to_of_node(fwspec->fwnode))); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c94837382037..0bb29316b436 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -547,7 +547,7 @@ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *noti INIT_WORK(¬ify->work, irq_affinity_notify); } - scoped_guard(raw_spinlock_irqsave, &desc->lock) { + scoped_guard(raw_spinlock_irq, &desc->lock) { old_notify = desc->affinity_notify; desc->affinity_notify = notify; } @@ -659,7 +659,7 @@ void __disable_irq(struct irq_desc *desc) static int __disable_irq_nosync(unsigned int irq) { - scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { __disable_irq(scoped_irqdesc); return 0; } @@ -789,7 +789,7 @@ void __enable_irq(struct irq_desc *desc) */ void enable_irq(unsigned int irq) { - scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) @@ -1001,7 +1001,6 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; - bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; @@ -1018,21 +1017,13 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a } scoped_guard(raw_spinlock_irq, &desc->lock) { - /* - * This code is triggered unconditionally. Check the affinity - * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. - */ - if (cpumask_available(desc->irq_common_data.affinity)) { - const struct cpumask *m; + const struct cpumask *m; - m = irq_data_get_effective_affinity_mask(&desc->irq_data); - cpumask_copy(mask, m); - valid = true; - } + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); } - if (valid) - set_cpus_allowed_ptr(current, mask); + set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else @@ -1239,7 +1230,10 @@ static int irq_thread(void *data) irq_thread_set_ready(desc, action); - sched_set_fifo(current); + if (action->handler == irq_forced_secondary_handler) + sched_set_fifo_secondary(current); + else + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) @@ -1405,19 +1399,39 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * references an already freed task_struct. */ new->thread = get_task_struct(t); + + /* + * The affinity can not be established yet, but it will be once the + * interrupt is enabled. Delay and defer the actual setting to the + * thread itself once it is ready to run. In the meantime, prevent + * it from ever being re-affined directly by cpuset or + * housekeeping. The proper way to do it is to re-affine the whole + * vector. + */ + kthread_bind_mask(t, cpu_possible_mask); + /* - * Tell the thread to set its affinity. This is - * important for shared interrupt handlers as we do - * not invoke setup_affinity() for the secondary - * handlers as everything is already set up. Even for - * interrupts marked with IRQF_NO_BALANCE this is - * correct as we want the thread to move to the cpu(s) - * on which the requesting code placed the interrupt. + * Ensure the thread adjusts the affinity once it reaches the + * thread function. */ - set_bit(IRQTF_AFFINITY, &new->thread_flags); + new->thread_flags = BIT(IRQTF_AFFINITY); + return 0; } +static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new) +{ + do { + if (cpumask_intersects(old->affinity, new->affinity) || + old->percpu_dev_id == new->percpu_dev_id) + return false; + + old = old->next; + } while (old); + + return true; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1438,6 +1452,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; + bool per_cpu_devid; if (!desc) return -EINVAL; @@ -1447,6 +1462,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!try_module_get(desc->owner)) return -ENODEV; + per_cpu_devid = irq_settings_is_per_cpu_devid(desc); + new->irq = irq; /* @@ -1554,13 +1571,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ unsigned int oldtype; - if (irq_is_nmi(desc)) { + if (irq_is_nmi(desc) && !per_cpu_devid) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } + if (per_cpu_devid && !valid_percpu_irqaction(old, new)) { + pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n", + new->name, irq, desc->irq_data.chip->name); + ret = -EINVAL; + goto out_unlock; + } + /* * If nobody did set the configuration before, inherit * the one provided by the requester. @@ -1711,7 +1735,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); - } else { + } else if (!per_cpu_devid) { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request @@ -2346,7 +2370,7 @@ void disable_percpu_nmi(unsigned int irq) static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; + struct irqaction *action, **action_ptr; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -2354,21 +2378,33 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ return NULL; scoped_guard(raw_spinlock_irqsave, &desc->lock) { - action = desc->action; - if (!action || action->percpu_dev_id != dev_id) { - WARN(1, "Trying to free already-free IRQ %d\n", irq); - return NULL; + action_ptr = &desc->action; + for (;;) { + action = *action_ptr; + + if (!action) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + return NULL; + } + + if (action->percpu_dev_id == dev_id) + break; + + action_ptr = &action->next; } - if (!cpumask_empty(desc->percpu_enabled)) { - WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", - irq, cpumask_first(desc->percpu_enabled)); + if (cpumask_intersects(desc->percpu_enabled, action->affinity)) { + WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, + cpumask_first_and(desc->percpu_enabled, action->affinity)); return NULL; } /* Found it - now remove it from the list of entries: */ - desc->action = NULL; - desc->istate &= ~IRQS_NMI; + *action_ptr = action->next; + + /* Demote from NMI if we killed the last action */ + if (!desc->action) + desc->istate &= ~IRQS_NMI; } unregister_handler_proc(irq, action); @@ -2442,17 +2478,49 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) return retval; } +static +struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags, + const char *devname, const cpumask_t *affinity, + void __percpu *dev_id) +{ + struct irqaction *action; + + if (!affinity) + affinity = cpu_possible_mask; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return NULL; + + action->handler = handler; + action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; + action->name = devname; + action->percpu_dev_id = dev_id; + action->affinity = affinity; + + /* + * We allow some form of sharing for non-overlapping affinity + * masks. Obviously, covering all CPUs prevents any sharing in + * the first place. + */ + if (!cpumask_equal(affinity, cpu_possible_mask)) + action->flags |= IRQF_SHARED; + + return action; +} + /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function * - * This call allocates interrupt resources and enables the interrupt on the - * local CPU. If the interrupt is supposed to be enabled on other CPUs, it - * has to be done on each CPU using enable_percpu_irq(). + * This call allocates interrupt resources, but doesn't enable the interrupt + * on any CPU, as all percpu-devid interrupts are flagged with IRQ_NOAUTOEN. + * It has to be done on each CPU using enable_percpu_irq(). * * @dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of @@ -2460,7 +2528,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *dev_id) + const cpumask_t *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2477,15 +2545,10 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, if (flags && flags != IRQF_TIMER) return -EINVAL; - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id); if (!action) return -ENOMEM; - action->handler = handler; - action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; - action->name = devname; - action->percpu_dev_id = dev_id; - retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); @@ -2508,6 +2571,7 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs @@ -2524,8 +2588,8 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq); * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. */ -int request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *name, void __percpu *dev_id) +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2542,20 +2606,16 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, !irq_supports_nmi(desc)) return -EINVAL; - /* The line cannot already be NMI */ - if (irq_is_nmi(desc)) + /* The line cannot be NMI already if the new request covers all CPUs */ + if (irq_is_nmi(desc) && + (!affinity || cpumask_equal(affinity, cpu_possible_mask))) return -EINVAL; - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING, + name, affinity, dev_id); if (!action) return -ENOMEM; - action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD - | IRQF_NOBALANCING; - action->name = name; - action->percpu_dev_id = dev_id; - retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index e7ad99254841..68886881fe10 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -706,7 +706,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq = ops->get_hwirq(info, arg); int i, ret; - if (irq_find_mapping(domain, hwirq) > 0) + if (irq_resolve_mapping(domain, hwirq)) return -EEXIST; if (domain->parent) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 29c2404e743b..77258eafbf63 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -48,6 +48,8 @@ static int show_irq_affinity(int type, struct seq_file *m) struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask; + guard(raw_spinlock_irq)(&desc->lock); + switch (type) { case AFFINITY: case AFFINITY_LIST: diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 1e7635864124..049e296f586c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -103,8 +103,11 @@ static char kallsyms_get_symbol_type(unsigned int off) { /* * Get just the first code, look it up in the token table, - * and return the first char from this token. + * and return the first char from this token. If MSB of length + * is 1, it is a "big" symbol, so needs an additional byte. */ + if (kallsyms_names[off] & 0x80) + off++; return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76f0940fb485..03d12e27189f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "KHO: " fmt +#include <linux/cleanup.h> #include <linux/cma.h> #include <linux/count_zeros.h> #include <linux/debugfs.h> @@ -22,6 +23,7 @@ #include <asm/early_ioremap.h> +#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. @@ -67,10 +69,10 @@ early_param("kho", kho_parse_enable); * Keep track of memory that is to be preserved across KHO. * * The serializing side uses two levels of xarrays to manage chunks of per-order - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most - * 512K of bitmap memory will be needed for order 0. + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of + * memory at most 512K of bitmap memory will be needed for order 0. * * This approach is fully incremental, as the serialization progresses folios * can continue be aggregated to the tracker. The final step, immediately prior @@ -78,12 +80,14 @@ early_param("kho", kho_parse_enable); * successor kernel to parse. */ -#define PRESERVE_BITS (512 * 8) +#define PRESERVE_BITS (PAGE_SIZE * 8) struct kho_mem_phys_bits { DECLARE_BITMAP(preserve, PRESERVE_BITS); }; +static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); + struct kho_mem_phys { /* * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized @@ -131,28 +135,28 @@ static struct kho_out kho_out = { .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) { - void *elm, *res; + void *res = xa_load(xa, index); + + if (res) + return res; - elm = xa_load(xa, index); - if (elm) - return elm; + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); - elm = kzalloc(sz, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); if (xa_is_err(res)) - res = ERR_PTR(xa_err(res)); - - if (res) { - kfree(elm); + return ERR_PTR(xa_err(res)); + else if (res) return res; - } - return elm; + return no_free_ptr(elm); } static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, @@ -167,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; physxa = xa_load(&track->orders, order); - if (!physxa) - continue; + if (WARN_ON_ONCE(!physxa)) + return; bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (!bits) - continue; + if (WARN_ON_ONCE(!bits)) + return; clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); @@ -216,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } } - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, - sizeof(*bits)); + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); if (IS_ERR(bits)) return PTR_ERR(bits); @@ -345,15 +348,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk; + struct khoser_mem_chunk *chunk __free(free_page) = NULL; - chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + chunk = (void *)get_zeroed_page(GFP_KERNEL); if (!chunk) - return NULL; + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + chunk->hdr.order = order; if (cur_chunk) KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return chunk; + return no_free_ptr(chunk); } static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) @@ -374,14 +381,17 @@ static int kho_mem_serialize(struct kho_serialization *ser) struct khoser_mem_chunk *chunk = NULL; struct kho_mem_phys *physxa; unsigned long order; + int err = -ENOMEM; xa_for_each(&ser->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } if (!first_chunk) first_chunk = chunk; @@ -391,8 +401,10 @@ static int kho_mem_serialize(struct kho_serialization *ser) if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } } elm = &chunk->bitmaps[chunk->hdr.num_elms]; @@ -409,7 +421,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) err_free: kho_mem_ser_free(first_chunk); - return -ENOMEM; + return err; } static void __init deserialize_bitmap(unsigned int order, @@ -465,8 +477,8 @@ static void __init kho_mem_deserialize(const void *fdt) * area for early allocations that happen before page allocator is * initialized. */ -static struct kho_scratch *kho_scratch; -static unsigned int kho_scratch_cnt; +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt; /* * The scratch areas are scaled by default as percent of memory allocated from @@ -752,6 +764,9 @@ int kho_preserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.ser.track; + if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) + return -EINVAL; + return __kho_preserve_order(track, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -775,6 +790,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) unsigned long failed_pfn = 0; int err = 0; + if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT))) { + return -EINVAL; + } + while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); @@ -862,16 +882,17 @@ err_free: return NULL; } -static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, + unsigned short order) { struct kho_mem_track *track = &kho_out.ser.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); __kho_unpreserve(track, pfn, pfn + 1); - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(track, pfn, pfn + (1 << order)); } } @@ -882,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) while (chunk) { struct kho_vmalloc_chunk *tmp = chunk; - kho_vmalloc_unpreserve_chunk(chunk); + kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); free_page((unsigned long)tmp); @@ -992,7 +1013,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) while (chunk) { struct page *page; - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { phys_addr_t phys = chunk->phys[i]; if (idx + contig_pages > total_pages) diff --git a/kernel/kexec_handover_debug.c b/kernel/kexec_handover_debug.c new file mode 100644 index 000000000000..6efb696f5426 --- /dev/null +++ b/kernel/kexec_handover_debug.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debug.c - kexec handover optional debug functionality + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include "kexec_handover_internal.h" + +bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + phys_addr_t scratch_start, scratch_end; + unsigned int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + scratch_start = kho_scratch[i].addr; + scratch_end = kho_scratch[i].addr + kho_scratch[i].size; + + if (phys < scratch_end && (phys + size) > scratch_start) + return true; + } + + return false; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h new file mode 100644 index 000000000000..3c3c7148ceed --- /dev/null +++ b/kernel/kexec_handover_internal.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include <linux/kexec_handover.h> +#include <linux/types.h> + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 31b072e8d427..99a3808d086f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { - unsigned long flags; - if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, mask); + /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) @@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; - unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { @@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); - /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, affinity); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 53d51ed619a3..4c0a9c18d0b2 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -18,3 +18,15 @@ config LIVEPATCH module uses the interface provided by this option to register a patch, causing calls to patched functions to be redirected to new function code contained in the patch module. + +config HAVE_KLP_BUILD + bool + help + Arch supports klp-build + +config KLP_BUILD + def_bool y + depends on LIVEPATCH && HAVE_KLP_BUILD + select OBJTOOL + help + Enable klp-build support diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0e73fac55f8e..9917756dae46 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -88,8 +88,14 @@ static struct klp_func *klp_find_func(struct klp_object *obj, struct klp_func *func; klp_for_each_func(obj, func) { + /* + * Besides identical old_sympos, also consider old_sympos + * of 0 and 1 are identical. + */ if ((strcmp(old_func->old_name, func->old_name) == 0) && - (old_func->old_sympos == func->old_sympos)) { + ((old_func->old_sympos == func->old_sympos) || + (old_func->old_sympos == 0 && func->old_sympos == 1) || + (old_func->old_sympos == 1 && func->old_sympos == 0))) { return func; } } @@ -217,14 +223,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { - pr_err("symbol %s is not marked as a livepatch symbol\n", - strtab + sym->st_name); + pr_err("symbol %s at rela sec %u idx %d is not marked as a livepatch symbol\n", + strtab + sym->st_name, symndx, i); return -EINVAL; } /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%511[^,],%lu", + KLP_SYM_PREFIX "%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", @@ -303,7 +309,7 @@ static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * See comment in klp_resolve_symbols() for an explanation * of the selected field width value. */ - cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]", sec_objname); if (cnt != 1) { pr_err("section %s has an incorrectly formatted name\n", diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ce0362f0a871..6567e5eeacc0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -103,8 +103,8 @@ static const struct kernel_param_ops lt_bind_ops = { .get = param_get_cpumask, }; -module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); -module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); +module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0444); +module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0444); long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); @@ -1211,6 +1211,10 @@ end: cxt.cur_ops->exit(); cxt.init_called = false; } + + free_cpumask_var(bind_readers); + free_cpumask_var(bind_writers); + torture_cleanup_end(); } diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 949103fd8e9b..2c6b02d4699b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -78,16 +78,8 @@ void debug_mutex_unlock(struct mutex *lock) } } -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) +void debug_mutex_init(struct mutex *lock) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); -#endif lock->magic = lock; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index de7d6702cd96..2a1d165b3167 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -43,8 +43,7 @@ # define MUTEX_WARN_ON(cond) #endif -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); raw_spin_lock_init(&lock->wait_lock); @@ -52,10 +51,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif - - debug_mutex_init(lock, name, key); + debug_mutex_init(lock); } -EXPORT_SYMBOL(__mutex_init); static inline struct task_struct *__owner_task(unsigned long owner) { @@ -142,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock) * There is nothing that would stop spreading the lockdep annotations outwards * except more code. */ +void mutex_init_generic(struct mutex *lock) +{ + __mutex_init_generic(lock); +} +EXPORT_SYMBOL(mutex_init_generic); /* * Optimistic trylock that only works in the uncontended case. Make sure to @@ -166,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } -#endif + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + __mutex_init_generic(lock); + + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_init_lockep); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) { diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 2e8080a9bee3..9ad4da8cea00 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -59,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock, extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +extern void debug_mutex_init(struct mutex *lock); #else /* CONFIG_DEBUG_MUTEXES */ # define debug_mutex_lock_common(lock, waiter) do { } while (0) # define debug_mutex_wake_waiter(lock, waiter) do { } while (0) @@ -68,6 +67,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_unlock(lock) do { } while (0) -# define debug_mutex_init(lock, name, key) do { } while (0) +# define debug_mutex_init(lock) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ #endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index bafd5af98eae..59dbd29cb219 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -515,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task) #ifdef CONFIG_PREEMPT_RT /* Mutexes */ -void __mutex_rt_init(struct mutex *mutex, const char *name, - struct lock_class_key *key) +static void __mutex_rt_init_generic(struct mutex *mutex) { + rt_mutex_base_init(&mutex->rtmutex); debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); - lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(__mutex_rt_init); static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int state, @@ -542,6 +540,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock, } #ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key) +{ + __mutex_rt_init_generic(mutex); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_rt_init_lockdep); + void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); @@ -598,6 +603,12 @@ int __sched _mutex_trylock_nest_lock(struct mutex *lock, EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ +void mutex_rt_init_generic(struct mutex *mutex) +{ + __mutex_rt_init_generic(mutex); +} +EXPORT_SYMBOL(mutex_rt_init_generic); + void __sched mutex_lock(struct mutex *lock) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 87b03d2e41db..2338b3adfb55 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -184,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock) static inline void debug_write_lock_before(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } diff --git a/kernel/module/main.c b/kernel/module/main.c index c66b26184936..7b3ec2fa6e7c 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3675,24 +3675,35 @@ static int idempotent_wait_for_completion(struct idempotent *u) static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { + bool compressed = !!(flags & MODULE_INIT_COMPRESSED_FILE); struct load_info info = { }; void *buf = NULL; int len; + int err; - len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE); + len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, + compressed ? READING_MODULE_COMPRESSED : + READING_MODULE); if (len < 0) { mod_stat_inc(&failed_kreads); return len; } - if (flags & MODULE_INIT_COMPRESSED_FILE) { - int err = module_decompress(&info, buf, len); + if (compressed) { + err = module_decompress(&info, buf, len); vfree(buf); /* compressed data is no longer needed */ if (err) { mod_stat_inc(&failed_decompress); mod_stat_add_long(len, &invalid_decompress_bytes); return err; } + err = security_kernel_post_read_file(f, (char *)info.hdr, info.len, + READING_MODULE); + if (err) { + mod_stat_inc(&failed_kreads); + free_copy(&info, flags); + return err; + } } else { info.hdr = buf; info.len = len; diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c1fb2bad6d72..bdc3c86231d3 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/ns_common.h> +#include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/user_namespace.h> #include <linux/vfsdebug.h> #ifdef CONFIG_DEBUG_VFS @@ -52,26 +55,257 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { + int ret = 0; + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); + ns_tree_node_init(&ns->ns_tree_node); + ns_tree_node_init(&ns->ns_unified_node); + ns_tree_node_init(&ns->ns_owner_node); + ns_tree_root_init(&ns->ns_owner_root); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); #endif - if (inum) { + if (inum) ns->inum = inum; - return 0; - } - return proc_alloc_inum(&ns->inum); + else + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + /* + * Tree ref starts at 0. It's incremented when namespace enters + * active use (installed in nsproxy) and decremented when all + * active uses are gone. Initial namespaces are always active. + */ + if (is_ns_init_inum(ns)) + atomic_set(&ns->__ns_ref_active, 1); + else + atomic_set(&ns->__ns_ref_active, 0); + return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } + +struct ns_common *__must_check ns_owner(struct ns_common *ns) +{ + struct user_namespace *owner; + + if (unlikely(!ns->ops)) + return NULL; + VFS_WARN_ON_ONCE(!ns->ops->owner); + owner = ns->ops->owner(ns); + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); + if (!owner) + return NULL; + /* Skip init_user_ns as it's always active */ + if (owner == &init_user_ns) + return NULL; + return to_ns_common(owner); +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * The iteration stops once we reach a namespace that still has active + * references. + */ +void __ns_ref_active_put(struct ns_common *ns) +{ + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + } +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. This makes it possible to efficiently + * resurrect a namespace tree: + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Assume the whole tree is dead but all namespaces are still active: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - - - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): + * + * net_ns pid_ns + * \ / + * + - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - + - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If net_ns had a zero reference count and we bumped it we also need to + * take another reference on its owning user namespace. Similarly, if + * pid_ns had a zero reference count it also needs to take another + * reference on its owning user namespace. So both net_ns and pid_ns + * will each have their own reference on the owning user namespace. + * + * If the owning user namespace user_ns1 had a zero reference count then + * it also needs to take another reference on its owning user namespace + * and so on. + */ +void __ns_ref_active_get(struct ns_common *ns) +{ + int prev; + + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + /* If we didn't resurrect the namespace we're done. */ + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + + /* + * We did resurrect it. Walk the ownership hierarchy upwards + * until we found an owning user namespace that is active. + */ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + } +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 19aa64ab08c8..259c4b4f1eeb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include <linux/syscalls.h> #include <linux/cgroup.h> #include <linux/perf_event.h> +#include <linux/nstree.h> static struct kmem_cache *nsproxy_cachep; @@ -59,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void) return nsproxy; } +static inline void nsproxy_free(struct nsproxy *ns) +{ + put_mnt_ns(ns->mnt_ns); + put_uts_ns(ns->uts_ns); + put_ipc_ns(ns->ipc_ns); + put_pid_ns(ns->pid_ns_for_children); + put_time_ns(ns->time_ns); + put_time_ns(ns->time_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); + put_net(ns->net_ns); + kmem_cache_free(nsproxy_cachep, ns); +} + +void deactivate_nsproxy(struct nsproxy *ns) +{ + nsproxy_ns_active_put(ns); + nsproxy_free(ns); +} + /* * Create new nsproxy and all of its the associated namespaces. * Return the newly created nsproxy. Do not attach this to the task, @@ -179,23 +199,11 @@ int copy_namespaces(u64 flags, struct task_struct *tsk) if ((flags & CLONE_VM) == 0) timens_on_fork(new_ns, tsk); + nsproxy_ns_active_get(new_ns); tsk->nsproxy = new_ns; return 0; } -void free_nsproxy(struct nsproxy *ns) -{ - put_mnt_ns(ns->mnt_ns); - put_uts_ns(ns->uts_ns); - put_ipc_ns(ns->ipc_ns); - put_pid_ns(ns->pid_ns_for_children); - put_time_ns(ns->time_ns); - put_time_ns(ns->time_ns_for_children); - put_cgroup_ns(ns->cgroup_ns); - put_net(ns->net_ns); - kmem_cache_free(nsproxy_cachep, ns); -} - /* * Called from unshare. Unshare all the namespaces part of nsproxy. * On success, returns the new nsproxy. @@ -232,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + if (new) + nsproxy_ns_active_get(new); + task_lock(p); ns = p->nsproxy; p->nsproxy = new; @@ -241,11 +252,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) put_nsproxy(ns); } -void exit_task_namespaces(struct task_struct *p) +void exit_nsproxy_namespaces(struct task_struct *p) { switch_task_namespaces(p, NULL); } +void switch_cred_namespaces(const struct cred *old, const struct cred *new) +{ + ns_ref_active_get(new->user_ns); + ns_ref_active_put(old->user_ns); +} + +void get_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_get(tsk->real_cred->user_ns); +} + +void exit_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_put(tsk->real_cred->user_ns); +} + int exec_task_namespaces(void) { struct task_struct *tsk = current; @@ -315,7 +342,7 @@ static void put_nsset(struct nsset *nsset) if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) free_fs_struct(nsset->fs); if (nsset->nsproxy) - free_nsproxy(nsset->nsproxy); + nsproxy_free(nsset->nsproxy); } static int prepare_nsset(unsigned flags, struct nsset *nsset) diff --git a/kernel/nstree.c b/kernel/nstree.c index b24a320a11a6..f36c59e6951d 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -1,140 +1,261 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/rculist.h> #include <linux/vfsdebug.h> +#include <linux/syscalls.h> +#include <linux/user_namespace.h> -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list - * @type: type of namespaces in this tree - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); + +DEFINE_LOCK_GUARD_0(ns_tree_writer, + write_seqlock(&ns_tree_lock), + write_sequnlock(&ns_tree_lock)) + +DEFINE_LOCK_GUARD_0(ns_tree_locked_reader, + read_seqlock_excl(&ns_tree_lock), + read_sequnlock_excl(&ns_tree_lock)) + +static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */ + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head), }; -struct ns_tree mnt_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), - .type = CLONE_NEWNS, +struct ns_tree_root mnt_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head), }; -struct ns_tree net_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), - .type = CLONE_NEWNET, +struct ns_tree_root net_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head), }; EXPORT_SYMBOL_GPL(net_ns_tree); -struct ns_tree uts_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), - .type = CLONE_NEWUTS, +struct ns_tree_root uts_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head), }; -struct ns_tree user_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), - .type = CLONE_NEWUSER, +struct ns_tree_root user_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head), }; -struct ns_tree ipc_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), - .type = CLONE_NEWIPC, +struct ns_tree_root ipc_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head), }; -struct ns_tree pid_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), - .type = CLONE_NEWPID, +struct ns_tree_root pid_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head), }; -struct ns_tree cgroup_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), - .type = CLONE_NEWCGROUP, +struct ns_tree_root cgroup_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head), }; -struct ns_tree time_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), - .type = CLONE_NEWTIME, +struct ns_tree_root time_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head), }; -DEFINE_COOKIE(namespace_cookie); +/** + * ns_tree_node_init - Initialize a namespace tree node + * @node: The node to initialize + * + * Initializes both the rbtree node and list entry. + */ +void ns_tree_node_init(struct ns_tree_node *node) +{ + RB_CLEAR_NODE(&node->ns_node); + INIT_LIST_HEAD(&node->ns_list_entry); +} + +/** + * ns_tree_root_init - Initialize a namespace tree root + * @root: The root to initialize + * + * Initializes both the rbtree root and list head. + */ +void ns_tree_root_init(struct ns_tree_root *root) +{ + root->ns_rb = RB_ROOT; + INIT_LIST_HEAD(&root->ns_list_head); +} + +/** + * ns_tree_node_empty - Check if a namespace tree node is empty + * @node: The node to check + * + * Returns true if the node is not in any tree. + */ +bool ns_tree_node_empty(const struct ns_tree_node *node) +{ + return RB_EMPTY_NODE(&node->ns_node); +} + +/** + * ns_tree_node_add - Add a node to a namespace tree + * @node: The node to add + * @root: The tree root to add to + * @cmp: Comparison function for rbtree insertion + * + * Adds the node to both the rbtree and the list, maintaining sorted order. + * The list is maintained in the same order as the rbtree to enable efficient + * iteration. + * + * Returns: NULL if insertion succeeded, existing node if duplicate found + */ +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node *ret, *prev; + + /* Add to rbtree */ + ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp); + + /* Add to list in sorted order */ + prev = rb_prev(&node->ns_node); + if (!prev) { + /* No previous node, add at head */ + list_add_rcu(&node->ns_list_entry, &root->ns_list_head); + } else { + /* Add after previous node */ + struct ns_tree_node *prev_node; + prev_node = rb_entry(prev, struct ns_tree_node, ns_node); + list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry); + } + + return ret; +} + +/** + * ns_tree_node_del - Remove a node from a namespace tree + * @node: The node to remove + * @root: The tree root to remove from + * + * Removes the node from both the rbtree and the list atomically. + */ +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root) +{ + rb_erase(&node->ns_node, &root->ns_rb); + RB_CLEAR_NODE(&node->ns_node); + list_bidir_del_rcu(&node->ns_list_entry); +} static inline struct ns_common *node_to_ns(const struct rb_node *node) { if (!node) return NULL; - return rb_entry(node, struct ns_common, ns_tree_node); + return rb_entry(node, struct ns_common, ns_tree_node.ns_node); } -static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) { - struct ns_common *ns_a = node_to_ns(a); - struct ns_common *ns_b = node_to_ns(b); - u64 ns_id_a = ns_a->ns_id; - u64 ns_id_b = ns_b->ns_id; + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_unified_node.ns_node); +} - if (ns_id_a < ns_id_b) +static inline struct ns_common *node_to_ns_owner(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_owner_node.ns_node); +} + +static int ns_id_cmp(u64 id_a, u64 id_b) +{ + if (id_a < id_b) return -1; - if (ns_id_a > ns_id_b) + if (id_a > id_b) return 1; return 0; } -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +static int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id); +} + +static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id); +} + +static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b) { - struct rb_node *node, *prev; + return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id); +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree) +{ + struct rb_node *node; + const struct proc_ns_operations *ops = ns->ops; VFS_WARN_ON_ONCE(!ns->ns_id); - write_seqlock(&ns_tree->ns_tree_lock); + guard(ns_tree_writer)(); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + /* Add to per-type tree and list */ + node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp); - node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. - */ - prev = rb_prev(&ns->ns_tree_node); - if (!prev) - list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); - else - list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + /* Add to unified tree and list */ + ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified); + + /* Add to owner's tree if applicable */ + if (ops) { + struct user_namespace *user_ns; - write_sequnlock(&ns_tree->ns_tree_lock); + VFS_WARN_ON_ONCE(!ops->owner); + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + /* Insert into owner's tree and list */ + ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner); + } else { + /* Only the initial user namespace doesn't have an owner. */ + VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns)); + } + } VFS_WARN_ON_ONCE(node); } -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree) { - VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); - VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + const struct proc_ns_operations *ops = ns->ops; + struct user_namespace *user_ns; + + VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry)); + + write_seqlock(&ns_tree_lock); + + /* Remove from per-type tree and list */ + ns_tree_node_del(&ns->ns_tree_node, ns_tree); + + /* Remove from unified tree and list */ + ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root); - write_seqlock(&ns_tree->ns_tree_lock); - rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); - list_bidir_del_rcu(&ns->ns_list_node); - RB_CLEAR_NODE(&ns->ns_tree_node); - write_sequnlock(&ns_tree->ns_tree_lock); + /* Remove from owner's tree if applicable */ + if (ops) { + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root); + } + } + + write_sequnlock(&ns_tree_lock); } EXPORT_SYMBOL_GPL(__ns_tree_remove); @@ -150,8 +271,19 @@ static int ns_find(const void *key, const struct rb_node *node) return 0; } +static int ns_find_unified(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns_unified(node); -static struct ns_tree *ns_tree_from_type(int ns_type) + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + +static struct ns_tree_root *ns_tree_from_type(int ns_type) { switch (ns_type) { case CLONE_NEWCGROUP: @@ -175,73 +307,507 @@ static struct ns_tree *ns_tree_from_type(int ns_type) return NULL; } -struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) { - struct ns_tree *ns_tree; struct rb_node *node; unsigned int seq; - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + do { + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified); + if (node) + break; + } while (read_seqretry(&ns_tree_lock, seq)); + + return node_to_ns_unified(node); +} + +static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree_root *ns_tree; + struct rb_node *node; + unsigned int seq; ns_tree = ns_tree_from_type(ns_type); if (!ns_tree) return NULL; do { - seq = read_seqbegin(&ns_tree->ns_tree_lock); - node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find); if (node) break; - } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + } while (read_seqretry(&ns_tree_lock, seq)); - if (!node) - return NULL; + return node_to_ns(node); +} - VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); - return node_to_ns(node); + if (ns_type) + return __ns_tree_lookup_rcu(ns_id, ns_type); + + return __ns_unified_tree_lookup_rcu(ns_id); } /** - * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * __ns_tree_adjoined_rcu - find the next/previous namespace in the same * tree * @ns: namespace to start from + * @ns_tree: namespace tree to search in * @previous: if true find the previous namespace, otherwise the next * * Find the next or previous namespace in the same tree as @ns. If * there is no next/previous namespace, -ENOENT is returned. */ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, bool previous) + struct ns_tree_root *ns_tree, bool previous) { struct list_head *list; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry)); else - list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); - if (list_is_head(list, &ns_tree->ns_list)) + list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry)); + if (list_is_head(list, &ns_tree->ns_list_head)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); - - return list_entry_rcu(list, struct ns_common, ns_list_node); + return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry); } /** - * ns_tree_gen_id - generate a new namespace id + * __ns_tree_gen_id - generate a new namespace id * @ns: namespace to generate id for + * @id: if non-zero, this is the initial namespace and this is a fixed id * * Generates a new namespace id and assigns it to the namespace. All * namespaces types share the same id space and thus can be compared * directly. IOW, when two ids of two namespace are equal, they are * identical. */ -u64 ns_tree_gen_id(struct ns_common *ns) +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) { - guard(preempt)(); - ns->ns_id = gen_cookie_next(&namespace_cookie); + static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1); + + if (id) + ns->ns_id = id; + else + ns->ns_id = atomic64_inc_return(&namespace_cookie); return ns->ns_id; } + +struct klistns { + u64 __user *uns_ids; + u32 nr_ns_ids; + u64 last_ns_id; + u64 user_ns_id; + u32 ns_type; + struct user_namespace *user_ns; + bool userns_capable; + struct ns_common *first_ns; +}; + +static void __free_klistns_free(const struct klistns *kls) +{ + if (kls->user_ns_id != LISTNS_CURRENT_USER) + put_user_ns(kls->user_ns); + if (kls->first_ns && kls->first_ns->ops) + kls->first_ns->ops->put(kls->first_ns); +} + +#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS) + +static int copy_ns_id_req(const struct ns_id_req __user *req, + struct ns_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < NS_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + if (kreq->ns_type & ~NS_ALL) + return -EOPNOTSUPP; + return 0; +} + +static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq, + u64 __user *ns_ids, size_t nr_ns_ids) +{ + kls->last_ns_id = kreq->ns_id; + kls->user_ns_id = kreq->user_ns_id; + kls->nr_ns_ids = nr_ns_ids; + kls->ns_type = kreq->ns_type; + kls->uns_ids = ns_ids; + return 0; +} + +/* + * Lookup a namespace owned by owner with id >= ns_id. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) +{ + struct ns_common *ret = NULL; + struct rb_node *node; + + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + guard(ns_tree_locked_reader)(); + + node = owner->ns_owner_root.ns_rb.rb_node; + while (node) { + struct ns_common *ns; + + ns = node_to_ns_owner(node); + if (ns_id <= ns->ns_id) { + ret = ns; + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type) +{ + struct ns_common *ns; + + guard(rcu)(); + ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type); + if (!ns) + return NULL; + + if (!ns_get_unless_inactive(ns)) + return NULL; + + return ns; +} + +static inline bool __must_check ns_requested(const struct klistns *kls, + const struct ns_common *ns) +{ + return !kls->ns_type || (kls->ns_type & ns->ns_type); +} + +static inline bool __must_check may_list_ns(const struct klistns *kls, + struct ns_common *ns) +{ + if (kls->user_ns) { + if (kls->userns_capable) + return true; + } else { + struct ns_common *owner; + struct user_namespace *user_ns; + + owner = ns_owner(ns); + if (owner) + user_ns = to_user_ns(owner); + else + user_ns = &init_user_ns; + if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) + return true; + } + + if (is_current_namespace(ns)) + return true; + + if (ns->ns_type != CLONE_NEWUSER) + return false; + + if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + return true; + + return false; +} + +static inline void ns_put(struct ns_common *ns) +{ + if (ns && ns->ops) + ns->ops->put(ns); +} + +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) + +static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, + struct ns_common *candidate) +{ + struct ns_common *ns __free(ns_put) = NULL; + + if (!ns_requested(kls, candidate)) + return NULL; + + ns = ns_get_unless_inactive(candidate); + if (!ns) + return NULL; + + if (!may_list_ns(kls, ns)) + return NULL; + + return no_free_ptr(ns); +} + +static ssize_t do_listns_userns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; + const struct list_head *head; + ssize_t ret; + + VFS_WARN_ON_ONCE(!kls->user_ns_id); + + if (kls->user_ns_id == LISTNS_CURRENT_USER) + ns = to_ns_common(current_user_ns()); + else if (kls->user_ns_id) + ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER); + if (!ns) + return -EINVAL; + kls->user_ns = to_user_ns(ns); + + /* + * Use the rbtree to find the first namespace we care about and + * then use it's list entry to iterate from there. + */ + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; + kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + + rcu_read_lock(); + + if (!first_ns) + first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry); + + ns = first_ns; + list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) { + struct ns_common *valid; + + if (!nr_ns_ids) + break; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +/* + * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) +{ + struct ns_common *ret = NULL; + struct ns_tree_root *ns_tree = NULL; + struct rb_node *node; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + } + + guard(ns_tree_locked_reader)(); + + if (ns_tree) + node = ns_tree->ns_rb.rb_node; + else + node = ns_unified_root.ns_rb.rb_node; + + while (node) { + struct ns_common *ns; + + if (ns_type) + ns = node_to_ns(node); + else + ns = node_to_ns_unified(node); + + if (ns_id <= ns->ns_id) { + if (ns_type) + ret = node_to_ns(node); + else + ret = node_to_ns_unified(node); + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static inline struct ns_common *first_ns_common(const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline struct ns_common *next_ns_common(struct ns_common *ns, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline bool ns_common_is_head(struct ns_common *ns, + const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return &ns->ns_tree_node.ns_list_entry == head; + return &ns->ns_unified_node.ns_list_entry == head; +} + +static ssize_t do_listns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns, *first_ns = NULL, *prev = NULL; + struct ns_tree_root *ns_tree = NULL; + const struct list_head *head; + u32 ns_type; + ssize_t ret; + + if (hweight32(kls->ns_type) == 1) + ns_type = kls->ns_type; + else + ns_type = 0; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return -EINVAL; + } + + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + if (ns_tree) + head = &ns_tree->ns_list_head; + else + head = &ns_unified_root.ns_list_head; + + rcu_read_lock(); + + if (!first_ns) + first_ns = first_ns_common(head, ns_tree); + + for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; + ns = next_ns_common(ns, ns_tree)) { + struct ns_common *valid; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, + u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags) +{ + struct klistns klns __free(klistns_free) = {}; + const size_t maxcount = 1000000; + struct ns_id_req kreq; + ssize_t ret; + + if (flags) + return -EINVAL; + + if (unlikely(nr_ns_ids > maxcount)) + return -EOVERFLOW; + + if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids))) + return -EFAULT; + + ret = copy_ns_id_req(req, &kreq); + if (ret) + return ret; + + ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids); + if (ret) + return ret; + + if (kreq.user_ns_id) + return do_listns_userns(&klns); + + return do_listns(&klns); +} diff --git a/kernel/panic.c b/kernel/panic.c index 24cc3eec1805..b2f2470af7e5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -873,13 +873,15 @@ void __warn(const char *file, int line, void *caller, unsigned taint, disable_trace_on_warning(); - if (file) - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", - raw_smp_processor_id(), current->pid, file, line, - caller); - else - pr_warn("WARNING: CPU: %d PID: %d at %pS\n", - raw_smp_processor_id(), current->pid, caller); + if (file) { + pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n", + file, line, caller, + raw_smp_processor_id(), current->comm, current->pid); + } else { + pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n", + caller, + raw_smp_processor_id(), current->comm, current->pid); + } #pragma GCC diagnostic push #ifndef __clang__ diff --git a/kernel/pid.c b/kernel/pid.c index 4fffec767a63..a31771bc89c1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,21 +71,16 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_pid_ns), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_pid_ns), -#ifdef CONFIG_PID_NS - .ns.ops = &pidns_operations, -#endif .pid_max = PID_MAX_DEFAULT, #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif - .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -117,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; + struct pid_namespace *active_ns; lockdep_assert_not_held(&tasklist_lock); + active_ns = pid->numbers[pid->level].ns; + ns_ref_active_put(active_ns); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; @@ -283,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } spin_unlock(&pidmap_lock); idr_preload_end(); + ns_ref_active_get(ns); return pid; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 650be58d8d18..e48f5de41361 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && ns_ref_put(ns)) + if (ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 54a623680019..05337f437cca 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -202,6 +202,17 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y +config PM_QOS_CPU_SYSTEM_WAKEUP + bool "User space interface for CPU system wakeup QoS" + depends on CPU_IDLE + help + Enable this to allow user space via the cpu_wakeup_latency file to + specify a CPU system wakeup latency limit. + + This may be particularly useful for platforms supporting multiple low + power states for CPUs during system-wide suspend and s2idle in + particular. + config PM bool "Device power management core functionality" help diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 874ad834dc8d..773e2789412b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -21,4 +21,6 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o -obj-$(CONFIG_ENERGY_MODEL) += energy_model.o +obj-$(CONFIG_ENERGY_MODEL) += em.o +em-y := energy_model.o +em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 19c48aa5355d..a906a0ac0f9b 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -44,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list); * no_console_suspend argument has been passed on the command line, VT * switches will occur. */ -void pm_vt_switch_required(struct device *dev, bool required) +int pm_vt_switch_required(struct device *dev, bool required) { struct pm_vt_switch *entry, *tmp; + int ret = 0; mutex_lock(&vt_switch_mutex); list_for_each_entry(tmp, &pm_vt_switch_list, head) { @@ -58,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required) } entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) + if (!entry) { + ret = -ENOMEM; goto out; + } entry->required = required; entry->dev = dev; @@ -67,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required) list_add(&entry->head, &pm_vt_switch_list); out: mutex_unlock(&vt_switch_mutex); + return ret; } EXPORT_SYMBOL(pm_vt_switch_required); diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c new file mode 100644 index 000000000000..4b85da138a06 --- /dev/null +++ b/kernel/power/em_netlink.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/energy_model.h> +#include <net/sock.h> +#include <net/genetlink.h> +#include <uapi/linux/energy_model.h> + +#include "em_netlink.h" +#include "em_netlink_autogen.h" + +#define EM_A_PD_CPUS_LEN 256 + +/*************************** Command encoding ********************************/ +static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + int *tot_msg_sz = data; + int msg_sz, cpus_sz; + + cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + + msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */ + nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */ + nla_total_size(cpus_sz); /* EM_A_PD_CPUS */ + + *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz)); + return 0; +} + +static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + struct sk_buff *msg = data; + struct nlattr *entry; + + entry = nla_nest_start(msg, EM_A_PDS_PD); + if (!entry) + goto out_cancel_nest; + + if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id)) + goto out_cancel_nest; + + if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD)) + goto out_cancel_nest; + + snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf)) + goto out_cancel_nest; + + nla_nest_end(msg, entry); + + return 0; + +out_cancel_nest: + nla_nest_cancel(msg, entry); + + return -EMSGSIZE; +} + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + int cmd = info->genlhdr->cmd; + int ret = -EMSGSIZE, msg_sz = 0; + + for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = for_each_em_perf_domain(__em_nl_get_pd, msg); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) +{ + struct em_perf_domain *pd; + int id; + + if (!attrs[EM_A_PD_TABLE_PD_ID]) + return NULL; + + id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + pd = em_perf_domain_get_by_id(id); + return pd; +} + +static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) +{ + int id_sz, ps_sz; + + id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ + nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + ps_sz *= pd->nr_perf_states; + + return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); +} + +static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +{ + struct em_perf_state *table, *ps; + struct nlattr *entry; + int i; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + goto out_err; + + rcu_read_lock(); + table = em_perf_state_from_pd((struct em_perf_domain *)pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &table[i]; + + entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + if (!entry) + goto out_unlock_ps; + + if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, + ps->performance, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, + ps->frequency, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_POWER, + ps->power, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_COST, + ps->cost, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, + ps->flags, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + + nla_nest_end(msg, entry); + } + rcu_read_unlock(); + return 0; + +out_cancel_ps_nest: + nla_nest_cancel(msg, entry); +out_unlock_ps: + rcu_read_unlock(); +out_err: + return -EMSGSIZE; +} + +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) +{ + int cmd = info->genlhdr->cmd; + int msg_sz, ret = -EMSGSIZE; + struct em_perf_domain *pd; + struct sk_buff *msg; + void *hdr; + + pd = __em_nl_get_pd_table_id(info->attrs); + if (!pd) + return -EINVAL; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +out_free_msg: + nlmsg_free(msg); + return ret; +} + + +/**************************** Event encoding *********************************/ +static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) +{ + struct sk_buff *msg; + int msg_sz, ret = -EMSGSIZE; + void *hdr; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +void em_notify_pd_created(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_CREATED); +} + +void em_notify_pd_updated(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); +} + +static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) +{ + int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + + return nlmsg_total_size(genlmsg_msg_size(id_sz)); +} + +void em_notify_pd_deleted(const struct em_perf_domain *pd) +{ + struct sk_buff *msg; + void *hdr; + int msg_sz; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_notify_pd_deleted_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + if (!hdr) + goto out_free_msg; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + goto out_free_msg; + } + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +/**************************** Initialization *********************************/ +static int __init em_netlink_init(void) +{ + return genl_register_family(&em_nl_family); +} +postcore_initcall(em_netlink_init); diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h new file mode 100644 index 000000000000..583d7f1c3939 --- /dev/null +++ b/kernel/power/em_netlink.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ +#ifndef _EM_NETLINK_H +#define _EM_NETLINK_H + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data); +struct em_perf_domain *em_perf_domain_get_by_id(int id); +void em_notify_pd_created(const struct em_perf_domain *pd); +void em_notify_pd_deleted(const struct em_perf_domain *pd); +void em_notify_pd_updated(const struct em_perf_domain *pd); +#else +static inline +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + return -EINVAL; +} +static inline +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + return NULL; +} + +static inline void em_notify_pd_created(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {} +#endif + +#endif /* _EM_NETLINK_H */ diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c new file mode 100644 index 000000000000..a7a09ab1d1c2 --- /dev/null +++ b/kernel/power/em_netlink_autogen.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "em_netlink_autogen.h" + +#include <uapi/linux/energy_model.h> + +/* EM_CMD_GET_PD_TABLE - do */ +static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { + [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for em */ +static const struct genl_split_ops em_nl_ops[] = { + { + .cmd = EM_CMD_GET_PDS, + .doit = em_nl_get_pds_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = EM_CMD_GET_PD_TABLE, + .doit = em_nl_get_pd_table_doit, + .policy = em_get_pd_table_nl_policy, + .maxattr = EM_A_PD_TABLE_PD_ID, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group em_nl_mcgrps[] = { + [EM_NLGRP_EVENT] = { "event", }, +}; + +struct genl_family em_nl_family __ro_after_init = { + .name = EM_FAMILY_NAME, + .version = EM_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = em_nl_ops, + .n_split_ops = ARRAY_SIZE(em_nl_ops), + .mcgrps = em_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), +}; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h new file mode 100644 index 000000000000..78ce609641f1 --- /dev/null +++ b/kernel/power/em_netlink_autogen.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_EM_GEN_H +#define _LINUX_EM_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/energy_model.h> + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + EM_NLGRP_EVENT, +}; + +extern struct genl_family em_nl_family; + +#endif /* _LINUX_EM_GEN_H */ diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 5f17d2e8e954..11af9f64aa82 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -17,12 +17,24 @@ #include <linux/sched/topology.h> #include <linux/slab.h> +#include "em_netlink.h" + /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. + */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + static void em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table); static void em_check_capacity_update(void); @@ -116,6 +128,16 @@ static int em_debug_flags_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_flags); +static int em_debug_id_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + + seq_printf(s, "%d\n", pd->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_id); + static void em_debug_create_pd(struct device *dev) { struct em_dbg_info *em_dbg; @@ -132,6 +154,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("flags", 0444, d, dev->em_pd, &em_debug_flags_fops); + debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops); + em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states, sizeof(*em_dbg), GFP_KERNEL); if (!em_dbg) @@ -328,6 +352,8 @@ int em_dev_update_perf_domain(struct device *dev, em_table_free(old_table); mutex_unlock(&em_pd_mutex); + + em_notify_pd_updated(pd); return 0; } EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); @@ -396,7 +422,7 @@ static int em_create_pd(struct device *dev, int nr_states, struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret, num_cpus; + int cpu, ret, num_cpus, id; if (_is_cpu_device(dev)) { num_cpus = cpumask_weight(cpus); @@ -420,6 +446,13 @@ static int em_create_pd(struct device *dev, int nr_states, pd->nr_perf_states = nr_states; + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + em_table = em_table_alloc(pd); if (!em_table) goto free_pd; @@ -444,6 +477,7 @@ free_pd_table: kfree(em_table); free_pd: kfree(pd); + ida_free(&em_pd_ida, id); return -EINVAL; } @@ -659,8 +693,16 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); + if (ret) + return ret; - return ret; + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_created(dev->em_pd); + + return 0; } EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); @@ -678,6 +720,12 @@ void em_dev_unregister_perf_domain(struct device *dev) if (_is_cpu_device(dev)) return; + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_deleted(dev->em_pd); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. @@ -689,6 +737,8 @@ void em_dev_unregister_perf_domain(struct device *dev) em_table_free(rcu_dereference_protected(dev->em_pd->em_table, lockdep_is_held(&em_pd_mutex))); + ida_free(&em_pd_ida, dev->em_pd->id); + kfree(dev->em_pd); dev->em_pd = NULL; mutex_unlock(&em_pd_mutex); @@ -958,3 +1008,39 @@ void em_rebuild_sched_domains(void) */ schedule_work(&rebuild_sd_work); } + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + int ret; + + ret = cb(pd, data); + if (ret) + return ret; + } + + return 0; +} + +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + if (pd->id == id) + return pd; + } + + return NULL; +} +#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 14e85ff23551..af8d07bafe02 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -706,7 +706,6 @@ static void power_down(void) #ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { - pm_restore_gfp_mask(); error = suspend_devices_and_enter(mem_sleep_current); if (!error) goto exit; @@ -746,9 +745,6 @@ static void power_down(void) cpu_relax(); exit: - /* Match the pm_restore_gfp_mask() call in hibernate(). */ - pm_restrict_gfp_mask(); - /* Restore swap signature. */ error = swsusp_unmark(); if (error) @@ -824,9 +820,11 @@ int hibernate(void) if (error) goto Restore; - ksys_sync_helper(); - if (filesystem_freeze_enabled) - filesystems_freeze(); + error = pm_sleep_fs_sync(); + if (error) + goto Notify; + + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -896,6 +894,7 @@ int hibernate(void) freezer_test_done = false; Exit: filesystems_thaw(); + Notify: pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); @@ -932,8 +931,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -1083,8 +1081,7 @@ static int software_resume(void) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 3cf2d7e72567..03b2c5495c77 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -18,6 +18,8 @@ #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> +#include <linux/atomic.h> +#include <linux/wait.h> #include "power.h" @@ -31,23 +33,35 @@ * held, unless the suspend/hibernate code is guaranteed not to run in parallel * with that modification). */ +static unsigned int saved_gfp_count; static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); - if (saved_gfp_mask) { - gfp_allowed_mask = saved_gfp_mask; - saved_gfp_mask = 0; - } + + if (WARN_ON(!saved_gfp_count) || --saved_gfp_count) + return; + + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + + pm_pr_dbg("GFP mask restored\n"); } void pm_restrict_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); - WARN_ON(saved_gfp_mask); + + if (saved_gfp_count++) { + WARN_ON((saved_gfp_mask & ~(__GFP_IO | __GFP_FS)) != gfp_allowed_mask); + return; + } + saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); + + pm_pr_dbg("GFP mask restricted\n"); } unsigned int lock_system_sleep(void) @@ -80,6 +94,61 @@ void ksys_sync_helper(void) } EXPORT_SYMBOL_GPL(ksys_sync_helper); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +/* Wakeup events handling resolution while syncing file systems in jiffies */ +#define PM_FS_SYNC_WAKEUP_RESOLUTION 5 + +static atomic_t pm_fs_sync_count = ATOMIC_INIT(0); +static struct workqueue_struct *pm_fs_sync_wq; +static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait); + +static bool pm_fs_sync_completed(void) +{ + return atomic_read(&pm_fs_sync_count) == 0; +} + +static void pm_fs_sync_work_fn(struct work_struct *work) +{ + ksys_sync_helper(); + + if (atomic_dec_and_test(&pm_fs_sync_count)) + wake_up(&pm_fs_sync_wait); +} +static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn); + +/** + * pm_sleep_fs_sync() - Sync file systems in an interruptible way + * + * Return: 0 on successful file system sync, or -EBUSY if the file system sync + * was aborted. + */ +int pm_sleep_fs_sync(void) +{ + pm_wakeup_clear(0); + + /* + * Take back-to-back sleeps into account by queuing a subsequent fs sync + * only if the previous fs sync is running or is not queued. Multiple fs + * syncs increase the likelihood of saving the latest files immediately + * before sleep. + */ + if (!work_pending(&pm_fs_sync_work)) { + atomic_inc(&pm_fs_sync_count); + queue_work(pm_fs_sync_wq, &pm_fs_sync_work); + } + + while (!pm_fs_sync_completed()) { + if (pm_wakeup_pending()) + return -EBUSY; + + wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(), + PM_FS_SYNC_WAKEUP_RESOLUTION); + } + + return 0; +} +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); @@ -219,10 +288,10 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr power_attr(mem_sleep); /* - * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * sync_on_suspend: Sync file systems before suspend. * - * show() returns whether ksys_sync_helper() is invoked before suspend. - * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + * show() returns whether file systems sync before suspend is enabled. + * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); @@ -1054,16 +1123,26 @@ static const struct attribute_group *attr_groups[] = { struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); -static int __init pm_start_workqueue(void) +static int __init pm_start_workqueues(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0); + if (!pm_wq) + return -ENOMEM; - return pm_wq ? 0 : -ENOMEM; +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0); + if (!pm_fs_sync_wq) { + destroy_workqueue(pm_wq); + return -ENOMEM; + } +#endif + + return 0; } static int __init pm_init(void) { - int error = pm_start_workqueue(); + int error = pm_start_workqueues(); if (error) return error; hibernate_image_size_init(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7ccd709af93f..75b63843886e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -19,6 +19,7 @@ struct swsusp_info { } __aligned(PAGE_SIZE); #if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern int pm_sleep_fs_sync(void); extern bool filesystem_freeze_enabled; #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index 8ff68ebaa1e0..dc0dfc349f22 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,6 +132,7 @@ int freeze_processes(void) if (!pm_freezing) static_branch_inc(&freezer_active); + pm_wakeup_clear(0); pm_freezing = true; error = try_to_freeze_tasks(true); if (!error) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 4244b069442e..f7d8064e9adc 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -415,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = { .fops = &cpu_latency_qos_fops, }; +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +/* The CPU system wakeup latency QoS. */ +static struct pm_qos_constraints cpu_wakeup_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .type = PM_QOS_MIN, +}; + +/** + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. + * + * Returns the current CPU system wakeup latency QoS limit that may have been + * requested by user space. + */ +s32 cpu_wakeup_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_wakeup_latency_constraints); +} + +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->qos = &cpu_wakeup_latency_constraints; + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + filp->private_data = req; + + return 0; +} + +static int cpu_wakeup_latency_qos_release(struct inode *inode, + struct file *filp) +{ + struct pm_qos_request *req = filp->private_data; + + filp->private_data = NULL; + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + kfree(req); + + return 0; +} + +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, + const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct pm_qos_request *req = filp->private_data; + s32 value; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + if (value < 0) + return -EINVAL; + + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); + + return count; +} + +static const struct file_operations cpu_wakeup_latency_qos_fops = { + .open = cpu_wakeup_latency_qos_open, + .release = cpu_wakeup_latency_qos_release, + .read = cpu_wakeup_latency_qos_read, + .write = cpu_wakeup_latency_qos_write, + .llseek = noop_llseek, +}; + +static struct miscdevice cpu_wakeup_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_wakeup_latency", + .fops = &cpu_wakeup_latency_qos_fops, +}; +#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ + static int __init cpu_latency_qos_init(void) { int ret; @@ -424,6 +523,13 @@ static int __init cpu_latency_qos_init(void) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP + ret = misc_register(&cpu_wakeup_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_wakeup_latency_qos_miscdev.name); +#endif + return ret; } late_initcall(cpu_latency_qos_init); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 645f42e40478..0a946932d5c1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2110,22 +2110,20 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - pr_info("Creating image:\n"); + pm_deferred_pr_dbg("Creating image\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); - pr_info("Need to copy %u pages\n", nr_pages + nr_highmem); + pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem); if (!enough_free_mem(nr_pages, nr_highmem)) { - pr_err("Not enough free memory\n"); + pm_deferred_pr_dbg("Not enough free memory for image creation\n"); return -ENOMEM; } - if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { - pr_err("Memory allocation failed\n"); + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) return -ENOMEM; - } /* * During allocating of suspend pagedir, new cold pages may appear. @@ -2144,7 +2142,8 @@ asmlinkage __visible int swsusp_save(void) nr_zero_pages = nr_pages - nr_copy_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages); + pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n", + nr_copy_pages, nr_zero_pages); return 0; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4bb4686c1c08..2da4482bb6eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -344,10 +344,14 @@ MODULE_PARM_DESC(pm_test_delay, static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG + int i; + if (pm_test_level == level) { pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); - mdelay(pm_test_delay * 1000); + for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++) + msleep(1000); + return 1; } #endif /* !CONFIG_PM_DEBUG */ @@ -375,8 +379,7 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); @@ -590,12 +593,15 @@ static int enter_state(suspend_state_t state) if (sync_on_suspend_enabled) { trace_suspend_resume(TPS("sync_filesystems"), 0, true); - ksys_sync_helper(); + + error = pm_sleep_fs_sync(); + if (error) + goto Unlock; + trace_suspend_resume(TPS("sync_filesystems"), 0, false); } pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); - pm_wakeup_clear(0); pm_suspend_clear_flags(); error = suspend_prepare(state); if (error) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 0beff7eeaaba..33a186373bef 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -46,19 +46,18 @@ static bool clean_pages_on_read; static bool clean_pages_on_decompress; /* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_ENTRIES swap entries. These + * structures are stored on the swap and linked together with the help of the + * .next_swap member. * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. + * The swap map is created during suspend. The swap map pages are allocated and + * populated one at a time, so we only need one memory page to set up the entire + * structure. * - * During resume we pick up all swap_map_page structures into a list. + * During resume we pick up all swap_map_page structures into a list. */ - #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* @@ -89,10 +88,8 @@ struct swap_map_page_list { }; /* - * The swap_map_handle structure is used for handling swap in - * a file-alike way + * The swap_map_handle structure is used for handling swap in a file-alike way. */ - struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; @@ -117,10 +114,9 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; /* - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. + * The following functions are used for tracing the allocated swap pages, so + * that they can be freed in case of an error. */ - struct swsusp_extent { struct rb_node node; unsigned long start; @@ -170,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/* - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - sector_t alloc_swapdev_block(int swap) { unsigned long offset; + /* + * Allocate a swap page and register that it has been allocated, so that + * it can be freed in case of an error. + */ offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) @@ -189,16 +184,14 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/* - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entries had been - * allocated. - */ - void free_all_swap_pages(int swap) { struct rb_node *node; + /* + * Free swap pages allocated for saving image data. It also frees the + * extents used to register which swap entries had been allocated. + */ while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; @@ -303,6 +296,7 @@ static int hib_wait_io(struct hib_bio_batch *hb) /* * Saving part */ + static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; @@ -336,16 +330,14 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) */ unsigned int swsusp_header_flags; -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - * - * This is called before saving image - */ static int swsusp_swap_check(void) { int res; + /* + * Check if the resume device is a swap device and get its index (if so). + * This is called before saving the image. + */ if (swsusp_resume_device) res = swap_type_of(swsusp_resume_device, swsusp_resume_block); else @@ -362,13 +354,6 @@ static int swsusp_swap_check(void) return 0; } -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. - * @offset: Offset of the swap page we're writing to. - * @hb: bio completion batch - */ - static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; @@ -519,17 +504,14 @@ static int swap_writer_finish(struct swap_map_handle *handle, CMP_HEADER, PAGE_SIZE) #define CMP_SIZE (CMP_PAGES * PAGE_SIZE) -/* Maximum number of threads for compression/decompression. */ -#define CMP_THREADS 3 +/* Default number of threads for compression/decompression. */ +#define CMP_THREADS 3 +static unsigned int hibernate_compression_threads = CMP_THREADS; /* Minimum/maximum number of pages for read buffering. */ #define CMP_MIN_RD_PAGES 1024 #define CMP_MAX_RD_PAGES 8192 -/** - * save_image - save the suspend image data - */ - static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -585,13 +567,48 @@ struct crc_data { wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */ - unsigned char *unc[CMP_THREADS]; /* uncompressed data */ + size_t **unc_len; /* uncompressed lengths */ + unsigned char **unc; /* uncompressed data */ }; -/* - * CRC32 update function that runs in its own thread. - */ +static struct crc_data *alloc_crc_data(int nr_threads) +{ + struct crc_data *crc; + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) + return NULL; + + crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL); + if (!crc->unc) + goto err_free_crc; + + crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL); + if (!crc->unc_len) + goto err_free_unc; + + return crc; + +err_free_unc: + kfree(crc->unc); +err_free_crc: + kfree(crc); + return NULL; +} + +static void free_crc_data(struct crc_data *crc) +{ + if (!crc) + return; + + if (crc->thr) + kthread_stop(crc->thr); + + kfree(crc->unc_len); + kfree(crc->unc); + kfree(crc); +} + static int crc32_threadfn(void *data) { struct crc_data *d = data; @@ -616,6 +633,7 @@ static int crc32_threadfn(void *data) } return 0; } + /* * Structure used for data compression. */ @@ -635,11 +653,8 @@ struct cmp_data { }; /* Indicates the image size after compression */ -static atomic_t compressed_size = ATOMIC_INIT(0); +static atomic64_t compressed_size = ATOMIC_INIT(0); -/* - * Compression function that runs in its own thread. - */ static int compress_threadfn(void *data) { struct cmp_data *d = data; @@ -664,19 +679,13 @@ static int compress_threadfn(void *data) d->ret = crypto_acomp_compress(d->cr); d->cmp_len = d->cr->dlen; - atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); + atomic64_add(d->cmp_len, &compressed_size); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * save_compressed_image - Save the suspend image data after compression. - * @handle: Swap map handle to use for saving the image. - * @snapshot: Image to read data from. - * @nr_to_write: Number of pages to save. - */ static int save_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) @@ -689,21 +698,21 @@ static int save_compressed_image(struct swap_map_handle *handle, ktime_t start; ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; hib_init_batch(&hb); - atomic_set(&compressed_size, 0); + atomic64_set(&compressed_size, 0); /* * We'll limit the number of threads for compression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { @@ -719,7 +728,7 @@ static int save_compressed_image(struct swap_map_handle *handle, goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -877,19 +886,18 @@ out_finish: stop = ktime_get(); if (!ret) ret = err2; - if (!ret) + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); - swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + } else { + pr_err("Image saving failed: %d\n", ret); + } out_clean: hib_finish_batch(&hb); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -899,18 +907,12 @@ out_clean: } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space available from the resume partition. - */ - static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); @@ -923,15 +925,16 @@ static int enough_swap(unsigned int nr_pages) } /** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header + * + * It is important _NOT_ to umount filesystems at this point. We want them + * synced (in case something goes wrong) but we DO not want to mark filesystem + * clean: it is not. (And it does not matter, if we resume correctly, we'll mark + * system clean, anyway.) * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) + * Return: 0 on success, negative error code on failure. */ - int swsusp_write(unsigned int flags) { struct swap_map_handle handle; @@ -974,8 +977,8 @@ out_finish: } /* - * The following functions allow us to read data using a swap map - * in a file-like way. + * The following functions allow us to read data using a swap map in a file-like + * way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -1077,12 +1080,6 @@ static int swap_reader_finish(struct swap_map_handle *handle) return 0; } -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1153,9 +1150,6 @@ struct dec_data { unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/* - * Decompression function that runs in its own thread. - */ static int decompress_threadfn(void *data) { struct dec_data *d = data; @@ -1190,12 +1184,6 @@ static int decompress_threadfn(void *data) return 0; } -/** - * load_compressed_image - Load compressed image data and decompress it. - * @handle: Swap map handle to use for loading data. - * @snapshot: Image to copy uncompressed data into. - * @nr_to_read: Number of pages to load. - */ static int load_compressed_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1223,7 +1211,7 @@ static int load_compressed_image(struct swap_map_handle *handle, * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, CMP_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page)); if (!page) { @@ -1239,7 +1227,7 @@ static int load_compressed_image(struct swap_map_handle *handle, goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -1506,11 +1494,7 @@ out_clean: hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) @@ -1529,8 +1513,9 @@ out_clean: * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memory location + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_read(unsigned int *flags_p) { int error; @@ -1567,8 +1552,9 @@ static void *swsusp_holder; /** * swsusp_check - Open the resume device and check for the swsusp signature. * @exclusive: Open the resume device exclusively. + * + * Return: 0 if a valid image is found, negative error code otherwise. */ - int swsusp_check(bool exclusive) { void *holder = exclusive ? &swsusp_holder : NULL; @@ -1618,7 +1604,6 @@ put: /** * swsusp_close - close resume device. */ - void swsusp_close(void) { if (IS_ERR(hib_resume_bdev_file)) { @@ -1630,9 +1615,10 @@ void swsusp_close(void) } /** - * swsusp_unmark - Unmark swsusp signature in the resume device + * swsusp_unmark - Unmark swsusp signature in the resume device + * + * Return: 0 on success, negative error code on failure. */ - #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { @@ -1658,8 +1644,46 @@ int swsusp_unmark(void) } #endif +static ssize_t hibernate_compression_threads_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", hibernate_compression_threads); +} + +static ssize_t hibernate_compression_threads_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val)) + return -EINVAL; + + if (val < 1) + return -EINVAL; + + hibernate_compression_threads = val; + return n; +} +power_attr(hibernate_compression_threads); + +static struct attribute *g[] = { + &hibernate_compression_threads_attr.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = g, +}; + static int __init swsusp_header_init(void) { + int error; + + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return -ENOMEM; + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); @@ -1667,3 +1691,19 @@ static int __init swsusp_header_init(void) } core_initcall(swsusp_header_init); + +static int __init hibernate_compression_threads_setup(char *str) +{ + int rc = kstrtouint(str, 0, &hibernate_compression_threads); + + if (rc) + return rc; + + if (hibernate_compression_threads < 1) + hibernate_compression_threads = CMP_THREADS; + + return 1; + +} + +__setup("hibernate_compression_threads=", hibernate_compression_threads_setup); diff --git a/kernel/power/user.c b/kernel/power/user.c index 3f9e3efb9f6e..4401cfe26e5c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -278,7 +278,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + break; error = freeze_processes(); if (error) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index f72bbfa266d6..5f5f626f4279 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -3,7 +3,6 @@ * internal.h - printk internal definitions */ #include <linux/console.h> -#include <linux/percpu.h> #include <linux/types.h> #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) @@ -112,47 +111,6 @@ bool nbcon_kthread_create(struct console *con); void nbcon_kthread_stop(struct console *con); void nbcon_kthreads_wake(void); -/* - * Check if the given console is currently capable and allowed to print - * records. Note that this function does not consider the current context, - * which can also play a role in deciding if @con can be used to print - * records. - */ -static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) -{ - if (!(flags & CON_ENABLED)) - return false; - - if ((flags & CON_SUSPENDED)) - return false; - - if (flags & CON_NBCON) { - /* The write_atomic() callback is optional. */ - if (use_atomic && !con->write_atomic) - return false; - - /* - * For the !use_atomic case, @printk_kthreads_running is not - * checked because the write_thread() callback is also used - * via the legacy loop when the printer threads are not - * available. - */ - } else { - if (!con->write) - return false; - } - - /* - * Console drivers may assume that per-cpu resources have been - * allocated. So unless they're explicitly marked as being able to - * cope (CON_ANYTIME) don't call them until this CPU is officially up. - */ - if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) - return false; - - return true; -} - /** * nbcon_kthread_wake - Wake up a console printing thread * @con: Console to operate on @@ -204,9 +162,6 @@ static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *hand static inline void nbcon_kthread_wake(struct console *con) { } static inline void nbcon_kthreads_wake(void) { } -static inline bool console_is_usable(struct console *con, short flags, - bool use_atomic) { return false; } - #endif /* CONFIG_PRINTK */ extern bool have_boot_console; @@ -230,6 +185,8 @@ struct console_flush_type { bool legacy_offload; }; +extern bool console_irqwork_blocked; + /* * Identify which console flushing methods should be used in the context of * the caller. @@ -241,7 +198,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) switch (nbcon_get_default_prio()) { case NBCON_PRIO_NORMAL: if (have_nbcon_console && !have_boot_console) { - if (printk_kthreads_running) + if (printk_kthreads_running && !console_irqwork_blocked) ft->nbcon_offload = true; else ft->nbcon_atomic = true; @@ -251,7 +208,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) if (have_legacy_console || have_boot_console) { if (!is_printk_legacy_deferred()) ft->legacy_direct = true; - else + else if (!console_irqwork_blocked) ft->legacy_offload = true; } break; @@ -264,7 +221,7 @@ static inline void printk_get_console_flush_type(struct console_flush_type *ft) if (have_legacy_console || have_boot_console) { if (!is_printk_legacy_deferred()) ft->legacy_direct = true; - else + else if (!console_irqwork_blocked) ft->legacy_offload = true; } break; diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 558ef3177976..3fa403f9831f 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -10,6 +10,7 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> +#include <linux/kdb.h> #include <linux/kthread.h> #include <linux/minmax.h> #include <linux/panic.h> @@ -118,6 +119,9 @@ * from scratch. */ +/* Counter of active nbcon emergency contexts. */ +static atomic_t nbcon_cpu_emergency_cnt = ATOMIC_INIT(0); + /** * nbcon_state_set - Helper function to set the console state * @con: Console to update @@ -249,13 +253,16 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, * since all non-panic CPUs are stopped during panic(), it * is safer to have them avoid gaining console ownership. * - * If this acquire is a reacquire (and an unsafe takeover + * One exception is when kdb has locked for printing on this CPU. + * + * Second exception is a reacquire (and an unsafe takeover * has not previously occurred) then it is allowed to attempt * a direct acquire in panic. This gives console drivers an * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ if (panic_on_other_cpu() && + !kdb_printf_on_this_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } @@ -850,8 +857,8 @@ out: return nbcon_context_can_proceed(ctxt, &cur); } -static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, - char *buf, unsigned int len) +void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; @@ -1163,6 +1170,17 @@ static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_contex if (kthread_should_stop()) return true; + /* + * Block the kthread when the system is in an emergency or panic mode. + * It increases the chance that these contexts would be able to show + * the messages directly. And it reduces the risk of interrupted writes + * where the context with a higher priority takes over the nbcon console + * ownership in the middle of a message. + */ + if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || + unlikely(panic_in_progress())) + return false; + cookie = console_srcu_read_lock(); flags = console_srcu_read_flags(con); @@ -1214,6 +1232,14 @@ wait_for_event: if (kthread_should_stop()) return 0; + /* + * Block the kthread when the system is in an emergency or panic + * mode. See nbcon_kthread_should_wakeup() for more details. + */ + if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) || + unlikely(panic_in_progress())) + goto wait_for_event; + backlog = false; /* @@ -1276,6 +1302,13 @@ void nbcon_kthreads_wake(void) if (!printk_kthreads_running) return; + /* + * It is not allowed to call this function when console irq_work + * is blocked. + */ + if (WARN_ON_ONCE(console_irqwork_blocked)) + return; + cookie = console_srcu_read_lock(); for_each_console_srcu(con) { if (!(console_srcu_read_flags(con) & CON_NBCON)) @@ -1404,6 +1437,26 @@ enum nbcon_prio nbcon_get_default_prio(void) return NBCON_PRIO_NORMAL; } +/* + * Track if it is allowed to perform unsafe hostile takeovers of console + * ownership. When true, console drivers might perform unsafe actions while + * printing. It is externally available via nbcon_allow_unsafe_takeover(). + */ +static bool panic_nbcon_allow_unsafe_takeover; + +/** + * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed + * + * Return: True, when it is permitted to perform unsafe console printing + * + * This is also used by console_is_usable() to determine if it is allowed to + * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE). + */ +bool nbcon_allow_unsafe_takeover(void) +{ + return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover; +} + /** * nbcon_legacy_emit_next_record - Print one record for an nbcon console * in legacy contexts @@ -1474,7 +1527,6 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on * failure. @@ -1493,8 +1545,7 @@ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, * returned, it cannot be expected that the unfinalized record will become * available. */ -static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, - bool allow_unsafe_takeover) +static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); @@ -1503,12 +1554,12 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, ctxt->console = con; ctxt->spinwait_max_us = 2000; ctxt->prio = nbcon_get_default_prio(); - ctxt->allow_unsafe_takeover = allow_unsafe_takeover; - - if (!nbcon_context_try_acquire(ctxt, false)) - return -EPERM; + ctxt->allow_unsafe_takeover = nbcon_allow_unsafe_takeover(); while (nbcon_seq_read(con) < stop_seq) { + if (!nbcon_context_try_acquire(ctxt, false)) + return -EPERM; + /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no @@ -1517,6 +1568,8 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, if (!nbcon_emit_next_record(&wctxt, true)) return -EAGAIN; + nbcon_context_release(ctxt); + if (!ctxt->backlog) { /* Are there reserved but not yet finalized records? */ if (nbcon_seq_read(con) < stop_seq) @@ -1525,7 +1578,6 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, } } - nbcon_context_release(ctxt); return err; } @@ -1534,15 +1586,13 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * This will stop flushing before @stop_seq if another context has ownership. * That context is then responsible for the flushing. Likewise, if new records * are added while this context was flushing and there is no other context * to handle the printing, this context must also flush those records. */ -static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, - bool allow_unsafe_takeover) +static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq) { struct console_flush_type ft; unsigned long flags; @@ -1557,7 +1607,7 @@ again: */ local_irq_save(flags); - err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + err = __nbcon_atomic_flush_pending_con(con, stop_seq); local_irq_restore(flags); @@ -1589,9 +1639,8 @@ again: * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * @stop_seq: Flush up until this record - * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers */ -static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) +static void __nbcon_atomic_flush_pending(u64 stop_seq) { struct console *con; int cookie; @@ -1609,7 +1658,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove if (nbcon_seq_read(con) >= stop_seq) continue; - nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + nbcon_atomic_flush_pending_con(con, stop_seq); } console_srcu_read_unlock(cookie); } @@ -1625,7 +1674,7 @@ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeove */ void nbcon_atomic_flush_pending(void) { - __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); } /** @@ -1637,7 +1686,9 @@ void nbcon_atomic_flush_pending(void) */ void nbcon_atomic_flush_unsafe(void) { - __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); + panic_nbcon_allow_unsafe_takeover = true; + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb)); + panic_nbcon_allow_unsafe_takeover = false; } /** @@ -1655,6 +1706,8 @@ void nbcon_cpu_emergency_enter(void) preempt_disable(); + atomic_inc(&nbcon_cpu_emergency_cnt); + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); (*cpu_emergency_nesting)++; } @@ -1669,10 +1722,24 @@ void nbcon_cpu_emergency_exit(void) unsigned int *cpu_emergency_nesting; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); - if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) (*cpu_emergency_nesting)--; + /* + * Wake up kthreads because there might be some pending messages + * added by other CPUs with normal priority since the last flush + * in the emergency context. + */ + if (!WARN_ON_ONCE(atomic_read(&nbcon_cpu_emergency_cnt) == 0)) { + if (atomic_dec_return(&nbcon_cpu_emergency_cnt) == 0) { + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + } + } + preempt_enable(); } @@ -1844,14 +1911,75 @@ void nbcon_device_release(struct console *con) * using the legacy loop. */ if (ft.nbcon_atomic) { - __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); + __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb)); } else if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } else if (ft.legacy_offload) { - printk_trigger_flush(); + defer_console_output(); } } console_srcu_read_unlock(cookie); } EXPORT_SYMBOL_GPL(nbcon_device_release); + +/** + * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe + * section + * @con: The nbcon console to acquire + * @wctxt: The nbcon write context to be used on success + * + * Context: Under console_srcu_read_lock() for emitting a single kdb message + * using the given con->write_atomic() callback. Can be called + * only when the console is usable at the moment. + * + * Return: True if the console was acquired. False otherwise. + * + * kdb emits messages on consoles registered for printk() without + * storing them into the ring buffer. It has to acquire the console + * ownerhip so that it could call con->write_atomic() callback a safe way. + * + * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY + * and marks it unsafe for handover/takeover. + */ +bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->console = con; + ctxt->prio = NBCON_PRIO_EMERGENCY; + + if (!nbcon_context_try_acquire(ctxt, false)) + return false; + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + return true; +} + +/** + * nbcon_kdb_release - Exit unsafe section and release the nbcon console + * + * @wctxt: The nbcon write context initialized by a successful + * nbcon_kdb_try_acquire() + */ +void nbcon_kdb_release(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + if (!nbcon_context_exit_unsafe(ctxt)) + return; + + nbcon_context_release(ctxt); + + /* + * Flush any new printk() messages added when the console was blocked. + * Only the console used by the given write context was blocked. + * The console was locked only when the write_atomic() callback + * was usable. + */ + __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb)); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5aee9ffb16b9..7394f1b6033b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -462,6 +462,9 @@ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; +/* Avoid using irq_work when suspending. */ +bool console_irqwork_blocked; + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); @@ -2390,7 +2393,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; - ft.legacy_offload |= ft.legacy_direct; + ft.legacy_offload |= ft.legacy_direct && !console_irqwork_blocked; ft.legacy_direct = false; } @@ -2426,7 +2429,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (ft.legacy_offload) defer_console_output(); - else + else if (!console_irqwork_blocked) wake_up_klogd(); return printed_len; @@ -2730,10 +2733,20 @@ void console_suspend_all(void) { struct console *con; + if (console_suspend_enabled) + pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); + + /* + * Flush any console backlog and then avoid queueing irq_work until + * console_resume_all(). Until then deferred printing is no longer + * triggered, NBCON consoles transition to atomic flushing, and + * any klogd waiters are not triggered. + */ + pr_flush(1000, true); + console_irqwork_blocked = true; + if (!console_suspend_enabled) return; - pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); - pr_flush(1000, true); console_list_lock(); for_each_console(con) @@ -2754,26 +2767,34 @@ void console_resume_all(void) struct console_flush_type ft; struct console *con; - if (!console_suspend_enabled) - return; - - console_list_lock(); - for_each_console(con) - console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); - console_list_unlock(); - /* - * Ensure that all SRCU list walks have completed. All printing - * contexts must be able to see they are no longer suspended so - * that they are guaranteed to wake up and resume printing. + * Allow queueing irq_work. After restoring console state, deferred + * printing and any klogd waiters need to be triggered in case there + * is now a console backlog. */ - synchronize_srcu(&console_srcu); + console_irqwork_blocked = false; + + if (console_suspend_enabled) { + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see they are no longer suspended so + * that they are guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + } printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); + else + wake_up_klogd(); pr_flush(1000, true); } @@ -3002,21 +3023,18 @@ out: } /* - * Legacy console printing from printk() caller context does not respect - * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a - * false positive. For PREEMPT_RT the false positive condition does not - * occur. - * - * This map is used to temporarily establish LD_WAIT_SLEEP context for the - * console write() callback when legacy printing to avoid false positive - * lockdep complaints, thus allowing lockdep to continue to function for - * real issues. + * The legacy console always acquires a spinlock_t from its printing + * callback. This violates lock nesting if the caller acquired an always + * spinning lock (raw_spinlock_t) while invoking printk(). This is not a + * problem on PREEMPT_RT because legacy consoles print always from a + * dedicated thread and never from within printk(). Therefore we tell + * lockdep that a sleeping spin lock (spinlock_t) is valid here. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else -static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); +static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_CONFIG); static inline void printk_legacy_allow_spinlock_enter(void) { @@ -3134,104 +3152,147 @@ static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ + /* - * Print out all remaining records to all consoles. + * Print out one record for each console. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. - * The value is valid only when this function returns true. It means that all - * usable consoles are completely flushed. + * The value is valid only when all usable consoles were flushed. It is + * when the function returns true (can do the job) and @try_again parameter + * is set to false, see below. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * - * Returns true when there was at least one usable console and all messages - * were flushed to all usable consoles. A returned false informs the caller - * that everything was not flushed (either there were no usable consoles or - * another context has taken over printing or it is a panic situation and this - * is not the panic CPU). Regardless the reason, the caller should assume it - * is not useful to immediately try again. + * @try_again will be set to true when it still makes sense to call this + * function again. The function could do the job, see the return value. + * And some consoles still make progress. + * + * Returns true when the function could do the job. Some consoles are usable, + * and there was no takeover and no panic_on_other_cpu(). * * Requires the console_lock. */ -static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +static bool console_flush_one_record(bool do_cond_resched, u64 *next_seq, bool *handover, + bool *try_again) { struct console_flush_type ft; bool any_usable = false; struct console *con; - bool any_progress; int cookie; - *next_seq = 0; - *handover = false; + *try_again = false; - do { - any_progress = false; + printk_get_console_flush_type(&ft); - printk_get_console_flush_type(&ft); + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; + bool progress; - cookie = console_srcu_read_lock(); - for_each_console_srcu(con) { - short flags = console_srcu_read_flags(con); - u64 printk_seq; - bool progress; + /* + * console_flush_one_record() is only responsible for + * nbcon consoles when the nbcon consoles cannot print via + * their atomic or threaded flushing. + */ + if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) + continue; - /* - * console_flush_all() is only responsible for nbcon - * consoles when the nbcon consoles cannot print via - * their atomic or threaded flushing. - */ - if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) - continue; + if (!console_is_usable(con, flags, !do_cond_resched)) + continue; + any_usable = true; - if (!console_is_usable(con, flags, !do_cond_resched)) - continue; - any_usable = true; + if (flags & CON_NBCON) { + progress = nbcon_legacy_emit_next_record(con, handover, cookie, + !do_cond_resched); + printk_seq = nbcon_seq_read(con); + } else { + progress = console_emit_next_record(con, handover, cookie); + printk_seq = con->seq; + } - if (flags & CON_NBCON) { - progress = nbcon_legacy_emit_next_record(con, handover, cookie, - !do_cond_resched); - printk_seq = nbcon_seq_read(con); - } else { - progress = console_emit_next_record(con, handover, cookie); - printk_seq = con->seq; - } + /* + * If a handover has occurred, the SRCU read lock + * is already released. + */ + if (*handover) + goto fail; - /* - * If a handover has occurred, the SRCU read lock - * is already released. - */ - if (*handover) - return false; + /* Track the next of the highest seq flushed. */ + if (printk_seq > *next_seq) + *next_seq = printk_seq; - /* Track the next of the highest seq flushed. */ - if (printk_seq > *next_seq) - *next_seq = printk_seq; + if (!progress) + continue; - if (!progress) - continue; - any_progress = true; + /* + * An usable console made a progress. There might still be + * pending messages. + */ + *try_again = true; - /* Allow panic_cpu to take over the consoles safely. */ - if (panic_on_other_cpu()) - goto abandon; + /* Allow panic_cpu to take over the consoles safely. */ + if (panic_on_other_cpu()) + goto fail_srcu; - if (do_cond_resched) - cond_resched(); - } - console_srcu_read_unlock(cookie); - } while (any_progress); + if (do_cond_resched) + cond_resched(); + } + console_srcu_read_unlock(cookie); return any_usable; -abandon: +fail_srcu: console_srcu_read_unlock(cookie); +fail: + *try_again = false; return false; } +/* + * Print out all remaining records to all consoles. + * + * @do_cond_resched is set by the caller. It can be true only in schedulable + * context. + * + * @next_seq is set to the sequence number after the last available record. + * The value is valid only when this function returns true. It means that all + * usable consoles are completely flushed. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns true when there was at least one usable console and all messages + * were flushed to all usable consoles. A returned false informs the caller + * that everything was not flushed (either there were no usable consoles or + * another context has taken over printing or it is a panic situation and this + * is not the panic CPU). Regardless the reason, the caller should assume it + * is not useful to immediately try again. + * + * Requires the console_lock. + */ +static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +{ + bool try_again; + bool ret; + + *next_seq = 0; + *handover = false; + + do { + ret = console_flush_one_record(do_cond_resched, next_seq, + handover, &try_again); + } while (try_again); + + return ret; +} + static void __console_flush_and_unlock(void) { bool do_cond_resched; @@ -3331,12 +3392,10 @@ void console_unblank(void) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - short flags = console_srcu_read_flags(c); - - if (flags & CON_SUSPENDED) + if (!console_is_usable(c, console_srcu_read_flags(c), true)) continue; - if ((flags & CON_ENABLED) && c->unblank) { + if (c->unblank) { found_unblank = true; break; } @@ -3373,12 +3432,10 @@ void console_unblank(void) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - short flags = console_srcu_read_flags(c); - - if (flags & CON_SUSPENDED) + if (!console_is_usable(c, console_srcu_read_flags(c), true)) continue; - if ((flags & CON_ENABLED) && c->unblank) + if (c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); @@ -3601,17 +3658,26 @@ static bool legacy_kthread_should_wakeup(void) static int legacy_kthread_func(void *unused) { - for (;;) { - wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); + bool try_again; + +wait_for_event: + wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); + + do { + bool handover = false; + u64 next_seq = 0; if (kthread_should_stop()) - break; + return 0; console_lock(); - __console_flush_and_unlock(); - } + console_flush_one_record(true, &next_seq, &handover, &try_again); + if (!handover) + __console_unlock(); - return 0; + } while (try_again); + + goto wait_for_event; } static bool legacy_kthread_create(void) @@ -4511,6 +4577,13 @@ static void __wake_up_klogd(int val) if (!printk_percpu_data_ready()) return; + /* + * It is not allowed to call this function when console irq_work + * is blocked. + */ + if (WARN_ON_ONCE(console_irqwork_blocked)) + return; + preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait @@ -4567,9 +4640,30 @@ void defer_console_output(void) __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } +/** + * printk_trigger_flush - Attempt to flush printk buffer to consoles. + * + * If possible, flush the printk buffer to all consoles in the caller's + * context. If offloading is available, trigger deferred printing. + * + * This is best effort. Depending on the system state, console states, + * and caller context, no actual flushing may result from this call. + */ void printk_trigger_flush(void) { - defer_console_output(); + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } + if (ft.legacy_offload) + defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 40198bffb7d0..56c8e3d031f4 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -411,6 +411,23 @@ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) return to_blk_size(size) <= DATA_SIZE(data_ring) / 2; } +/* + * Compare the current and requested logical position and decide + * whether more space is needed. + * + * Return false when @lpos_current is already at or beyond @lpos_target. + * + * Also return false when the difference between the positions is bigger + * than the size of the data buffer. It might happen only when the caller + * raced with another CPU(s) which already made and used the space. + */ +static bool need_more_space(struct prb_data_ring *data_ring, + unsigned long lpos_current, + unsigned long lpos_target) +{ + return lpos_target - lpos_current - 1 < DATA_SIZE(data_ring); +} + /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) @@ -577,7 +594,7 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long id; /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ - while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { + while (need_more_space(data_ring, lpos_begin, lpos_end)) { blk = to_block(data_ring, lpos_begin); /* @@ -668,7 +685,7 @@ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) * sees the new tail lpos, any descriptor states that transitioned to * the reusable state must already be visible. */ - while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { + while (need_more_space(data_ring, tail_lpos, lpos)) { /* * Make all descriptors reusable that are associated with * data blocks before @lpos. @@ -999,6 +1016,17 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) return true; } +static bool is_blk_wrapped(struct prb_data_ring *data_ring, + unsigned long begin_lpos, unsigned long next_lpos) +{ + /* + * Subtract one from next_lpos since it's not actually part of this data + * block. This allows perfectly fitting records to not wrap. + */ + return DATA_WRAPS(data_ring, begin_lpos) != + DATA_WRAPS(data_ring, next_lpos - 1); +} + /* Determine the end of a data block. */ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, unsigned long lpos, unsigned int size) @@ -1010,7 +1038,7 @@ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, next_lpos = lpos + size; /* First check if the data block does not wrap. */ - if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) + if (!is_blk_wrapped(data_ring, begin_lpos, next_lpos)) return next_lpos; /* Wrapping data blocks store their data at the beginning. */ @@ -1087,7 +1115,7 @@ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, blk = to_block(data_ring, begin_lpos); blk->id = id; /* LMM(data_alloc:B) */ - if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { + if (is_blk_wrapped(data_ring, begin_lpos, next_lpos)) { /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); @@ -1131,14 +1159,21 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, return NULL; /* Keep track if @blk_lpos was a wrapping data block. */ - wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); + wrapped = is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next); size = to_blk_size(size); next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); - /* If the data block does not increase, there is nothing to do. */ - if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { + /* + * Use the current data block when the size does not increase, i.e. + * when @head_lpos is already able to accommodate the new @next_lpos. + * + * Note that need_more_space() could never return false here because + * the difference between the positions was bigger than the data + * buffer size. The data block is reopened and can't get reused. + */ + if (!need_more_space(data_ring, head_lpos, next_lpos)) { if (wrapped) blk = to_block(data_ring, 0); else @@ -1167,7 +1202,7 @@ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, blk = to_block(data_ring, blk_lpos->begin); - if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { + if (is_blk_wrapped(data_ring, blk_lpos->begin, next_lpos)) { struct prb_data_block *old_blk = blk; /* Wrapping data blocks store their data at the beginning. */ @@ -1203,7 +1238,7 @@ static unsigned int space_used(struct prb_data_ring *data_ring, if (BLK_DATALESS(blk_lpos)) return 0; - if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { + if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - DATA_INDEX(data_ring, blk_lpos->begin)); @@ -1249,15 +1284,15 @@ static const char *get_data(struct prb_data_ring *data_ring, return NULL; } - /* Regular data block: @begin less than @next and in same wrap. */ - if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && - blk_lpos->begin < blk_lpos->next) { + /* Regular data block: @begin and @next in the same wrap. */ + if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) { db = to_block(data_ring, blk_lpos->begin); *data_size = blk_lpos->next - blk_lpos->begin; /* Wrapping data block: @begin is one wrap behind @next. */ - } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == - DATA_WRAPS(data_ring, blk_lpos->next)) { + } else if (!is_blk_wrapped(data_ring, + blk_lpos->begin + DATA_SIZE(data_ring), + blk_lpos->next)) { db = to_block(data_ring, 0); *data_size = DATA_INDEX(data_ring, blk_lpos->next); @@ -1267,6 +1302,10 @@ static const char *get_data(struct prb_data_ring *data_ring, return NULL; } + /* Sanity check. Data-less blocks were handled earlier. */ + if (WARN_ON_ONCE(!data_check_size(data_ring, *data_size) || !*data_size)) + return NULL; + /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 75a84efad40f..392ec2f75f01 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, unsigned long size, void __user *data) { struct ptrace_rseq_configuration conf = { - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = task->rseq_len, - .signature = task->rseq_sig, + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, + .rseq_abi_size = task->rseq.len, + .signature = task->rseq.sig, .flags = 0, }; diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 12e4c64ebae1..625d75392647 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -213,4 +213,19 @@ config RCU_STRICT_GRACE_PERIOD when looking for certain types of RCU usage bugs, for example, too-short RCU read-side critical sections. + +config RCU_DYNTICKS_TORTURE + bool "Minimize RCU dynticks counter size" + depends on RCU_EXPERT && !COMPILE_TEST + default n + help + This option sets the width of the dynticks counter to its + minimum usable value. This minimum width greatly increases + the probability of flushing out bugs involving counter wrap, + but it also increases the probability of extending grace period + durations. This Kconfig option should therefore be avoided in + production due to the consequent increased probability of OOMs. + + This has no value for production and is only for testing. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 29fe3c01312f..07e51974b06b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -389,6 +389,7 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + void (*exp_current)(void); unsigned long (*get_gp_state_exp)(void); unsigned long (*start_gp_poll_exp)(void); void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); @@ -691,10 +692,29 @@ static struct rcu_torture_ops rcu_busted_ops = { */ DEFINE_STATIC_SRCU(srcu_ctl); +DEFINE_STATIC_SRCU_FAST(srcu_ctlf); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud); static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; static struct rcu_torture_ops srcud_ops; +static void srcu_torture_init(void) +{ + rcu_sync_torture_init(); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) + VERBOSE_TOROUT_STRING("srcu_torture_init normal SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_NMI) + VERBOSE_TOROUT_STRING("srcu_torture_init NMI-safe SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + srcu_ctlp = &srcu_ctlf; + VERBOSE_TOROUT_STRING("srcu_torture_init fast SRCU"); + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + srcu_ctlp = &srcu_ctlfud; + VERBOSE_TOROUT_STRING("srcu_torture_init fast-up/down SRCU"); + } +} + static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) { srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq); @@ -722,6 +742,12 @@ static int srcu_torture_read_lock(void) scp = srcu_read_lock_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); ret += idx << 3; } return ret; @@ -749,8 +775,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_read_unlock_fast_updown(srcu_ctlp, + __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_FAST) - srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2)); if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) @@ -784,7 +813,7 @@ static int srcu_torture_down_read(void) WARN_ON_ONCE(idx & ~0x1); return idx; } - if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { scp = srcu_down_read_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); WARN_ON_ONCE(idx & ~0x1); @@ -797,7 +826,7 @@ static int srcu_torture_down_read(void) static void srcu_torture_up_read(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); - if (reader_flavor & SRCU_READ_FLAVOR_FAST) + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) @@ -857,9 +886,14 @@ static void srcu_torture_synchronize_expedited(void) synchronize_srcu_expedited(srcu_ctlp); } +static void srcu_torture_expedite_current(void) +{ + srcu_expedite_current(srcu_ctlp); +} + static struct rcu_torture_ops srcu_ops = { .ttype = SRCU_FLAVOR, - .init = rcu_sync_torture_init, + .init = srcu_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -871,6 +905,7 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, .same_gp_state = same_state_synchronize_srcu, .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, @@ -886,14 +921,28 @@ static struct rcu_torture_ops srcu_ops = { .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) - ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcu" }; -static void srcu_torture_init(void) +static void srcud_torture_init(void) { rcu_sync_torture_init(); - WARN_ON(init_srcu_struct(&srcu_ctld)); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init normal SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_NMI) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init NMI-safe SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + WARN_ON(init_srcu_struct_fast(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast-up/down SRCU"); + } else { + WARN_ON(init_srcu_struct(&srcu_ctld)); + } srcu_ctlp = &srcu_ctld; } @@ -906,7 +955,7 @@ static void srcu_torture_cleanup(void) /* As above, but dynamically allocated. */ static struct rcu_torture_ops srcud_ops = { .ttype = SRCU_FLAVOR, - .init = srcu_torture_init, + .init = srcud_torture_init, .cleanup = srcu_torture_cleanup, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, @@ -919,6 +968,7 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, .same_gp_state = same_state_synchronize_srcu, .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, @@ -934,7 +984,7 @@ static struct rcu_torture_ops srcud_ops = { .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) - ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcud" }; @@ -1700,6 +1750,8 @@ rcu_torture_writer(void *arg) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; + if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + cur_ops->exp_current(); while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); for (i = 0; i < ulo_size; i++) @@ -1720,6 +1772,8 @@ rcu_torture_writer(void *arg) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; + if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + cur_ops->exp_current(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); for (i = 0; i < rgo_size; i++) @@ -2384,10 +2438,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) newstate = rcutorture_extend_mask(rtors.readstate, trsp); WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++); - if (!rcu_torture_one_read_start(&rtors, trsp, myid)) { - rcutorture_one_extend(&rtors.readstate, 0, trsp, rtors.rtrsp); + if (!rcu_torture_one_read_start(&rtors, trsp, myid)) return false; - } rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp); rcu_torture_one_read_end(&rtors, trsp); return true; diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 19841704d8f5..07a313782dfd 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -136,6 +136,7 @@ struct ref_scale_ops { void (*cleanup)(void); void (*readsection)(const int nloops); void (*delaysection)(const int nloops, const int udl, const int ndl); + bool enable_irqs; const char *name; }; @@ -184,6 +185,8 @@ static const struct ref_scale_ops rcu_ops = { // Definitions for SRCU ref scale testing. DEFINE_STATIC_SRCU(srcu_refctl_scale); +DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale); static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; static void srcu_ref_scale_read_section(const int nloops) @@ -216,6 +219,12 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static bool srcu_fast_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_refctl_scale; + return true; +} + static void srcu_fast_ref_scale_read_section(const int nloops) { int i; @@ -240,12 +249,48 @@ static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, c } static const struct ref_scale_ops srcu_fast_ops = { - .init = rcu_sync_scale_init, + .init = srcu_fast_sync_scale_init, .readsection = srcu_fast_ref_scale_read_section, .delaysection = srcu_fast_ref_scale_delay_section, .name = "srcu-fast" }; +static bool srcu_fast_updown_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_updown_refctl_scale; + return true; +} + +static void srcu_fast_updown_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); + } +} + +static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_updown_ops = { + .init = srcu_fast_updown_sync_scale_init, + .readsection = srcu_fast_updown_ref_scale_read_section, + .delaysection = srcu_fast_updown_ref_scale_delay_section, + .name = "srcu-fast-updown" +}; + #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -323,6 +368,9 @@ static const struct ref_scale_ops rcu_trace_ops = { // Definitions for reference count static atomic_t refcnt; +// Definitions acquire-release. +static DEFINE_PER_CPU(unsigned long, test_acqrel); + static void ref_refcnt_section(const int nloops) { int i; @@ -351,6 +399,184 @@ static const struct ref_scale_ops refcnt_ops = { .name = "refcnt" }; +static void ref_percpuinc_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + this_cpu_dec(test_acqrel); + } +} + +static void ref_percpuinc_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + un_delay(udl, ndl); + this_cpu_dec(test_acqrel); + } +} + +static const struct ref_scale_ops percpuinc_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_percpuinc_section, + .delaysection = ref_percpuinc_delay_section, + .name = "percpuinc" +}; + +// Note that this can lose counts in preemptible kernels. +static void ref_incpercpu_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static void ref_incpercpu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static const struct ref_scale_ops incpercpu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpu_section, + .delaysection = ref_incpercpu_delay_section, + .name = "incpercpu" +}; + +static void ref_incpercpupreempt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static void ref_incpercpupreempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static const struct ref_scale_ops incpercpupreempt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpupreempt_section, + .delaysection = ref_incpercpupreempt_delay_section, + .name = "incpercpupreempt" +}; + +static void ref_incpercpubh_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static void ref_incpercpubh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static const struct ref_scale_ops incpercpubh_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpubh_section, + .delaysection = ref_incpercpubh_delay_section, + .enable_irqs = true, + .name = "incpercpubh" +}; + +static void ref_incpercpuirqsave_section(const int nloops) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static void ref_incpercpuirqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static const struct ref_scale_ops incpercpuirqsave_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpuirqsave_section, + .delaysection = ref_incpercpuirqsave_delay_section, + .name = "incpercpuirqsave" +}; + // Definitions for rwlock static rwlock_t test_rwlock; @@ -494,9 +720,6 @@ static const struct ref_scale_ops lock_irq_ops = { .name = "lock-irq" }; -// Definitions acquire-release. -static DEFINE_PER_CPU(unsigned long, test_acqrel); - static void ref_acqrel_section(const int nloops) { unsigned long x; @@ -629,6 +852,133 @@ static const struct ref_scale_ops jiffies_ops = { .name = "jiffies" }; +static void ref_preempt_section(const int nloops) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + preempt_enable(); + } + migrate_enable(); +} + +static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + un_delay(udl, ndl); + preempt_enable(); + } + migrate_enable(); +} + +static const struct ref_scale_ops preempt_ops = { + .readsection = ref_preempt_section, + .delaysection = ref_preempt_delay_section, + .name = "preempt" +}; + +static void ref_bh_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + local_bh_enable(); + } + preempt_enable(); +} + +static void ref_bh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + un_delay(udl, ndl); + local_bh_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops bh_ops = { + .readsection = ref_bh_section, + .delaysection = ref_bh_delay_section, + .enable_irqs = true, + .name = "bh" +}; + +static void ref_irq_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + local_irq_enable(); + } + preempt_enable(); +} + +static void ref_irq_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + un_delay(udl, ndl); + local_irq_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops irq_ops = { + .readsection = ref_irq_section, + .delaysection = ref_irq_delay_section, + .name = "irq" +}; + +static void ref_irqsave_section(const int nloops) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + local_irq_restore(flags); + } + preempt_enable(); +} + +static void ref_irqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + un_delay(udl, ndl); + local_irq_restore(flags); + } + preempt_enable(); +} + +static const struct ref_scale_ops irqsave_ops = { + .readsection = ref_irqsave_section, + .delaysection = ref_irqsave_delay_section, + .name = "irqsave" +}; + //////////////////////////////////////////////////////////////////////// // // Methods leveraging SLAB_TYPESAFE_BY_RCU. @@ -924,15 +1274,18 @@ repeat: if (!atomic_dec_return(&n_warmedup)) while (atomic_read_acquire(&n_warmedup)) rcu_scale_one_reader(); - // Also keep interrupts disabled. This also has the effect - // of preventing entries into slow path for rcu_read_unlock(). - local_irq_save(flags); + // Also keep interrupts disabled when it is safe to do so, which + // it is not for local_bh_enable(). This also has the effect of + // preventing entries into slow path for rcu_read_unlock(). + if (!cur_ops->enable_irqs) + local_irq_save(flags); start = ktime_get_mono_fast_ns(); rcu_scale_one_reader(); duration = ktime_get_mono_fast_ns() - start; - local_irq_restore(flags); + if (!cur_ops->enable_irqs) + local_irq_restore(flags); rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; // To reduce runtime-skew noise, do maintain-load invocations until @@ -1163,9 +1516,13 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS - &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, - &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops, + RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops, + &incpercpubh_ops, &incpercpuirqsave_ops, + &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &sched_clock_ops, &clock_ops, &jiffies_ops, + &preempt_ops, &bh_ops, &irq_ops, &irqsave_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index e3b64a5e0ec7..3450c3751ef7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -106,15 +106,15 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPTION operation - * means that we get away with murder on synchronization. ;-) + * that become ready as a result. Single-CPU operation and preemption + * disabling mean that we get away with murder on synchronization. ;-) */ void srcu_drive_gp(struct work_struct *wp) { @@ -141,7 +141,12 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); - swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + do { + // Deadlock issues prevent __srcu_read_unlock() from + // doing an unconditional wakeup, so polling is required. + swait_event_timeout_exclusive(ssp->srcu_wq, + !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10); + } while (READ_ONCE(ssp->srcu_lock_nesting[idx])); preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 1ff94b76d91f..ea3f128de06f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -286,32 +286,92 @@ err_free_sup: #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key) +static int +__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); return init_srcu_struct_fields(ssp, false); } + +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = 0; + return __init_srcu_struct_common(ssp, name, key); +} EXPORT_SYMBOL_GPL(__init_srcu_struct); +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast); + +int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name, + struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown); + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** * init_srcu_struct - initialize a sleep-RCU structure * @ssp: structure to initialize. * - * Must invoke this on a given srcu_struct before passing that srcu_struct + * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends. It is necessary + * to invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ int init_srcu_struct(struct srcu_struct *ssp) { + ssp->srcu_reader_flavor = 0; return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); +/** + * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock_fast() and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast); + +/** + * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and + * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct + * structures that are to be passed to srcu_read_lock_fast_updown(), + * srcu_down_read_fast(), and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast_updown(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown); + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* @@ -461,7 +521,7 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { int cpu; - unsigned long mask = 0; + unsigned long mask = ssp->srcu_reader_flavor; unsigned long sum = 0; for_each_possible_cpu(cpu) { @@ -734,6 +794,10 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) sdp = raw_cpu_ptr(ssp->sda); old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor && + old_read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor); if (!old_read_flavor) { old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); if (!old_read_flavor) @@ -1688,6 +1752,64 @@ void srcu_barrier(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(srcu_barrier); +/* Callback for srcu_expedite_current() usage. */ +static void srcu_expedite_current_cb(struct rcu_head *rhp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); + + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + WARN_ON_ONCE(1); + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_IDLE; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, requeue ourselves as an expedited SRCU callback. + if (needcb) + __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); +} + +/** + * srcu_expedite_current - Expedite the current SRCU grace period + * @ssp: srcu_struct to expedite. + * + * Cause the current SRCU grace period to become expedited. The grace + * period following the current one might also be expedited. If there is + * no current grace period, one might be created. If the current grace + * period is currently sleeping, that sleep will complete before expediting + * will take effect. + */ +void srcu_expedite_current(struct srcu_struct *ssp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp; + + migrate_disable(); + sdp = this_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_REPOST; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, queue an expedited SRCU callback. + if (needcb) + __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); + migrate_enable(); +} +EXPORT_SYMBOL_GPL(srcu_expedite_current); + /** * srcu_batches_completed - return batches completed. * @ssp: srcu_struct on which to report batch completion. diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c1ebfd51768b..585cade21010 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -70,12 +70,10 @@ void rcu_qs(void) */ void rcu_sched_clock_irq(int user) { - if (user) { + if (user) rcu_qs(); - } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) + set_need_resched_current(); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8293bae1dec1..293bbd9ac3f4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2696,10 +2696,8 @@ void rcu_sched_clock_irq(int user) /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { /* Idle and userspace execution already are quiescent states. */ - if (!rcu_is_cpu_rrupt_from_idle() && !user) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + if (!rcu_is_cpu_rrupt_from_idle() && !user) + set_need_resched_current(); __this_cpu_write(rcu_data.rcu_urgent_qs, false); } rcu_flavor_sched_clock_irq(user); @@ -2824,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work) /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { - unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -2837,8 +2834,8 @@ static __latent_entropy void rcu_core(void) if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { - set_tsk_need_resched(current); - set_preempt_need_resched(); + guard(irqsave)(); + set_need_resched_current(); } /* Update RCU state based on any recent quiescent states. */ @@ -2847,10 +2844,9 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { - local_irq_save(flags); + guard(irqsave)(); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); @@ -4021,7 +4017,7 @@ bool rcu_cpu_online(int cpu) * RCU on an offline processor during initial boot, hence the check for * rcu_scheduler_fully_active. */ -bool rcu_lockdep_current_cpu_online(void) +bool notrace rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; bool ret = false; diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6058a734090c..96c49c56fc14 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -729,8 +729,7 @@ static void rcu_exp_need_qs(void) __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); /* Store .exp before .rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d85763336b3c..dbe2d02be824 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -753,8 +753,7 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting and no possible deboosting, // slow is OK. Plus nohz_full CPUs eventually get // tick enabled. - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && cpu_online(rdp->cpu)) { @@ -813,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user) if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ - if (rcu_preempt_need_deferred_qs(t)) { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + if (rcu_preempt_need_deferred_qs(t)) + set_need_resched_current(); } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index d16afeb11506..b67532cb8770 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -763,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) * progress and it could be we're stuck in kernel space without context * switches for an entirely unreasonable amount of time. */ - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } static bool csd_lock_suppress_rcu_stall; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c912b594ba98..dfeba9b35395 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -117,7 +117,7 @@ static bool rcu_read_lock_held_common(bool *ret) return false; } -int rcu_read_lock_sched_held(void) +int notrace rcu_read_lock_sched_held(void) { bool ret; @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); * Note that rcu_read_lock() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ -int rcu_read_lock_held(void) +int notrace rcu_read_lock_held(void) { bool ret; @@ -367,7 +367,7 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ -int rcu_read_lock_bh_held(void) +int notrace rcu_read_lock_bh_held(void) { bool ret; @@ -377,7 +377,7 @@ int rcu_read_lock_bh_held(void) } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); -int rcu_read_lock_any_held(void) +int notrace rcu_read_lock_any_held(void) { bool ret; diff --git a/kernel/rseq.c b/kernel/rseq.c index 2452b7366b00..395d8b002350 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -8,98 +8,7 @@ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ -#include <linux/sched.h> -#include <linux/uaccess.h> -#include <linux/syscalls.h> -#include <linux/rseq.h> -#include <linux/types.h> -#include <linux/ratelimit.h> -#include <asm/ptrace.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/rseq.h> - -/* The original rseq structure size (including padding) is 32 bytes. */ -#define ORIG_RSEQ_SIZE 32 - -#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) - -#ifdef CONFIG_DEBUG_RSEQ -static struct rseq *rseq_kernel_fields(struct task_struct *t) -{ - return (struct rseq *) t->rseq_fields; -} - -static int rseq_validate_ro_fields(struct task_struct *t) -{ - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - u32 cpu_id_start, cpu_id, node_id, mm_cid; - struct rseq __user *rseq = t->rseq; - - /* - * Validate fields which are required to be read-only by - * user-space. - */ - if (!user_read_access_begin(rseq, t->rseq_len)) - goto efault; - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_get_user(node_id, &rseq->node_id, efault_end); - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); - user_read_access_end(); - - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || - cpu_id != rseq_kernel_fields(t)->cpu_id || - node_id != rseq_kernel_fields(t)->node_id || - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { - - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" - "\tcpu_id_start: %u ?= %u\n" - "\tcpu_id: %u ?= %u\n" - "\tnode_id: %u ?= %u\n" - "\tmm_cid: %u ?= %u\n", - t->pid, t->comm, - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, - cpu_id, rseq_kernel_fields(t)->cpu_id, - node_id, rseq_kernel_fields(t)->node_id, - mm_cid, rseq_kernel_fields(t)->mm_cid); - } - - /* For now, only print a console warning on mismatch. */ - return 0; - -efault_end: - user_read_access_end(); -efault: - return -EFAULT; -} - -/* - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent - * state. - */ -#define rseq_unsafe_put_user(t, value, field, error_label) \ - do { \ - unsafe_put_user(value, &t->rseq->field, error_label); \ - rseq_kernel_fields(t)->field = value; \ - } while (0) - -#else -static int rseq_validate_ro_fields(struct task_struct *t) -{ - return 0; -} - -#define rseq_unsafe_put_user(t, value, field, error_label) \ - unsafe_put_user(value, &t->rseq->field, error_label) -#endif - /* - * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing @@ -158,356 +67,356 @@ static int rseq_validate_ro_fields(struct task_struct *t) * F1. <failure> */ -static int rseq_update_cpu_node_id(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq; - u32 cpu_id = raw_smp_processor_id(); - u32 node_id = cpu_to_node(cpu_id); - u32 mm_cid = task_mm_cid(t); +/* Required to select the proper per_cpu ops for rseq_stats_inc() */ +#define RSEQ_BUILD_SLOW_PATH - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; - WARN_ON_ONCE((int) mm_cid < 0); - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; +#include <linux/debugfs.h> +#include <linux/ratelimit.h> +#include <linux/rseq_entry.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <asm/ptrace.h> - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); +#define CREATE_TRACE_POINTS +#include <trace/events/rseq.h> - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally updated only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - trace_rseq_update(t); - return 0; +DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); -efault_end: - user_write_access_end(); -efault: - return -EFAULT; +static inline void rseq_control_debug(bool on) +{ + if (on) + static_branch_enable(&rseq_debug_enabled); + else + static_branch_disable(&rseq_debug_enabled); } -static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) +static int __init rseq_setup_debug(char *str) { - struct rseq __user *rseq = t->rseq; - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, - mm_cid = 0; - - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; + bool on; - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; - - /* - * Reset all fields to their initial state. - * - * All fields have an initial state of 0 except cpu_id which is set to - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after - * unregistration can figure out that rseq needs to be registered - * again. - */ - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); - - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally reset only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - return 0; - -efault_end: - user_write_access_end(); -efault: - return -EFAULT; + if (kstrtobool(str, &on)) + return -EINVAL; + rseq_control_debug(on); + return 1; } +__setup("rseq_debug=", rseq_setup_debug); +#ifdef CONFIG_TRACEPOINTS /* - * Get the user-space pointer value stored in the 'rseq_cs' field. + * Out of line, so the actual update functions can be in a header to be + * inlined into the exit to user code. */ -static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +void __rseq_trace_update(struct task_struct *t) { - if (!rseq_cs) - return -EFAULT; - -#ifdef CONFIG_64BIT - if (get_user(*rseq_cs, &rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) - return -EFAULT; -#endif + trace_rseq_update(t); +} - return 0; +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); } +#endif /* CONFIG_TRACEPOINTS */ -/* - * If the rseq_cs field of 'struct rseq' contains a valid pointer to - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. - */ -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +#ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_RSEQ_STATS +DEFINE_PER_CPU(struct rseq_stats, rseq_stats); + +static int rseq_stats_show(struct seq_file *m, void *p) { - struct rseq_cs __user *urseq_cs; - u64 ptr; - u32 __user *usig; - u32 sig; - int ret; - - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); - if (ret) - return ret; - - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ - if (!ptr) { - memset(rseq_cs, 0, sizeof(*rseq_cs)); - return 0; + struct rseq_stats stats = { }; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); } - /* Check that the pointer value fits in the user-space process space. */ - if (ptr >= TASK_SIZE) - return -EINVAL; - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) - return -EFAULT; - if (rseq_cs->start_ip >= TASK_SIZE || - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || - rseq_cs->abort_ip >= TASK_SIZE || - rseq_cs->version > 0) - return -EINVAL; - /* Check for overflow. */ - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) - return -EINVAL; - /* Ensure that abort_ip is not in the critical section. */ - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) - return -EINVAL; + seq_printf(m, "exit: %16lu\n", stats.exit); + seq_printf(m, "signal: %16lu\n", stats.signal); + seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "fastp: %16lu\n", stats.fastpath); + seq_printf(m, "ids: %16lu\n", stats.ids); + seq_printf(m, "cs: %16lu\n", stats.cs); + seq_printf(m, "clear: %16lu\n", stats.clear); + seq_printf(m, "fixup: %16lu\n", stats.fixup); + return 0; +} - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); - ret = get_user(sig, usig); - if (ret) - return ret; +static int rseq_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_stats_show, inode->i_private); +} - if (current->rseq_sig != sig) { - printk_ratelimited(KERN_WARNING - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq_sig, current->pid, usig); - return -EINVAL; - } +static const struct file_operations stat_ops = { + .open = rseq_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_stats_init(struct dentry *root_dir) +{ + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); return 0; } +#else +static inline void rseq_stats_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_STATS */ -static bool rseq_warn_flags(const char *str, u32 flags) +static int rseq_debug_show(struct seq_file *m, void *p) { - u32 test_flags; - - if (!flags) - return false; - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); - return true; + bool on = static_branch_unlikely(&rseq_debug_enabled); + + seq_printf(m, "%d\n", on); + return 0; } -static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) { - u32 flags, event_mask; - int ret; + bool on; - if (rseq_warn_flags("rseq_cs", cs_flags)) + if (kstrtobool_from_user(ubuf, count, &on)) return -EINVAL; - /* Get thread flags. */ - ret = get_user(flags, &t->rseq->flags); - if (ret) - return ret; + rseq_control_debug(on); + return count; +} - if (rseq_warn_flags("rseq", flags)) - return -EINVAL; +static int rseq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_debug_show, inode->i_private); +} - /* - * Load and clear event mask atomically with respect to - * scheduler preemption and membarrier IPIs. - */ - scoped_guard(RSEQ_EVENT_GUARD) { - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - } +static const struct file_operations debug_ops = { + .open = rseq_debug_open, + .read = seq_read, + .write = rseq_debug_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_debugfs_init(void) +{ + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); - return !!event_mask; + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); + rseq_stats_init(root_dir); + return 0; } +__initcall(rseq_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ -static int clear_rseq_cs(struct rseq __user *rseq) +static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { - /* - * The rseq_cs field is set to NULL on preemption or signal - * delivery on top of rseq assembly block, as well as on top - * of code outside of the rseq assembly block. This performs - * a lazy clear of the rseq_cs field. - * - * Set rseq_cs to NULL. - */ -#ifdef CONFIG_64BIT - return put_user(0UL, &rseq->rseq_cs); -#else - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) - return -EFAULT; - return 0; -#endif + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); } -/* - * Unsigned comparison will be true when ip >= start_ip, and when - * ip < start_ip + post_commit_offset. - */ -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; + struct rseq __user *urseq = t->rseq.usrptr; + u64 csaddr; + + scoped_user_read_access(urseq, efault) + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); + if (likely(!csaddr)) + return true; + return rseq_update_user_cs(t, regs, csaddr); +efault: + return false; } -static int rseq_ip_fixup(struct pt_regs *regs) +static void rseq_slowpath_update_usr(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); + /* + * Preserve rseq state and user_irq state. The generic entry code + * clears user_irq on the way out, the non-generic entry + * architectures are not having user_irq. + */ + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; struct task_struct *t = current; - struct rseq_cs rseq_cs; - int ret; + struct rseq_ids ids; + u32 node_id; + bool event; + + if (unlikely(t->flags & PF_EXITING)) + return; - ret = rseq_get_rseq_cs(t, &rseq_cs); - if (ret) - return ret; + rseq_stat_inc(rseq_stats.slowpath); /* - * Handle potentially not being within a critical section. - * If not nested over a rseq critical section, restart is useless. - * Clear the rseq_cs pointer and return. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. */ - if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t->rseq); - ret = rseq_need_restart(t, rseq_cs.flags); - if (ret <= 0) - return ret; - ret = clear_rseq_cs(t->rseq); - if (ret) - return ret; - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, - rseq_cs.abort_ip); - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); - return 0; + scoped_guard(irq) { + event = t->rseq.event.sched_switch; + t->rseq.event.all &= evt_mask.all; + ids.cpu_id = task_cpu(t); + ids.mm_cid = task_mm_cid(t); + } + + if (!event) + return; + + node_id = cpu_to_node(ids.cpu_id); + + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + /* + * Clear the errors just in case this might survive magically, but + * leave the rest intact. + */ + t->rseq.event.error = 0; + force_sig(SIGSEGV); + } } -/* - * This resume handler must always be executed between any of: - * - preemption, - * - signal delivery, - * and return to user-space. - * - * This is how we can ensure that the entire rseq critical section - * will issue the commit instruction only if executed atomically with - * respect to other threads scheduled on the same CPU, and with respect - * to signal handlers. - */ -void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) +void __rseq_handle_slowpath(struct pt_regs *regs) { - struct task_struct *t = current; - int ret, sig; - - if (unlikely(t->flags & PF_EXITING)) + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) return; + rseq_slowpath_update_usr(regs); +} + +void __rseq_signal_deliver(int sig, struct pt_regs *regs) +{ + rseq_stat_inc(rseq_stats.signal); /* - * regs is NULL if and only if the caller is in a syscall path. Skip - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and - * kill a misbehaving userspace on debug kernels. + * Don't update IDs, they are handled on exit to user if + * necessary. The important thing is to abort a critical section of + * the interrupted context as after this point the instruction + * pointer in @regs points to the signal handler. */ - if (regs) { - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + if (unlikely(!rseq_handle_cs(current, regs))) { + /* + * Clear the errors just in case this might survive + * magically, but leave the rest intact. + */ + current->rseq.event.error = 0; + force_sigsegv(sig); } - if (unlikely(rseq_update_cpu_node_id(t))) - goto error; - return; - -error: - sig = ksig ? ksig->sig : 0; - force_sigsegv(sig); } -#ifdef CONFIG_DEBUG_RSEQ - /* * Terminate the process if a syscall is issued within a restartable * sequence. */ -void rseq_syscall(struct pt_regs *regs) +void __rseq_debug_syscall_return(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; - struct rseq_cs rseq_cs; + u64 csaddr; - if (!t->rseq) + if (!t->rseq.event.has_rseq) return; - if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV); + if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) + goto fail; + if (likely(!csaddr)) + return; + if (unlikely(csaddr >= TASK_SIZE)) + goto fail; + if (rseq_debug_update_user_cs(t, regs, csaddr)) + return; +fail: + force_sig(SIGSEGV); } +#ifdef CONFIG_DEBUG_RSEQ +/* Kept around to keep GENERIC_ENTRY=n architectures supported. */ +void rseq_syscall(struct pt_regs *regs) +{ + __rseq_debug_syscall_return(regs); +} #endif +static bool rseq_reset_ids(void) +{ + struct rseq_ids ids = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, + .mm_cid = 0, + }; + + /* + * If this fails, terminate it because this leaves the kernel in + * stupid state as exit to user space will try to fixup the ids + * again. + */ + if (rseq_set_ids(current, &ids, 0)) + return true; + + force_sig(SIGSEGV); + return false; +} + +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + /* * sys_rseq - setup restartable sequences for caller thread. */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, - int, flags, u32, sig) +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { - int ret; - u64 rseq_cs; - if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ - if (current->rseq != rseq || !current->rseq) + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) return -EINVAL; - if (rseq_len != current->rseq_len) + if (rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_node_id(current); - if (ret) - return ret; - current->rseq = NULL; - current->rseq_sig = 0; - current->rseq_len = 0; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); return 0; } if (unlikely(flags)) return -EINVAL; - if (current->rseq) { + if (current->rseq.usrptr) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != current->rseq_len) + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; @@ -531,43 +440,39 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!access_ok(rseq, rseq_len)) return -EFAULT; - /* - * If the rseq_cs pointer is non-NULL on registration, clear it to - * avoid a potential segfault on return to user-space. The proper thing - * to do would have been to fail the registration but this would break - * older libcs that reuse the rseq area for new threads without - * clearing the fields. - */ - if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) - return -EFAULT; - if (rseq_cs && clear_rseq_cs(rseq)) - return -EFAULT; + scoped_user_write_access(rseq, efault) { + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. Don't bother reading it, just reset it. + */ + unsafe_put_user(0UL, &rseq->rseq_cs, efault); + /* Initialize IDs in user space */ + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0U, &rseq->node_id, efault); + unsafe_put_user(0U, &rseq->mm_cid, efault); + } -#ifdef CONFIG_DEBUG_RSEQ - /* - * Initialize the in-kernel rseq fields copy for validation of - * read-only fields. - */ - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) - return -EFAULT; -#endif /* * Activate the registration by setting the rseq area address, length * and signature in the task struct. */ - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + current->rseq.usrptr = rseq; + current->rseq.len = rseq_len; + current->rseq.sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ - rseq_set_notify_resume(current); - + current->rseq.event.has_rseq = true; + rseq_force_update(); return 0; + +efault: + return -EFAULT; } diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index cdea931aae30..954137775f38 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * this process can already run with task_group() == prev->tg or we can * race with cgroup code which can read autogroup = prev under rq->lock. * In the latter case for_each_thread() can not miss a migrating thread, - * cpu_cgroup_attach() must not be possible after cgroup_exit() and it - * can't be removed from thread list, we hold ->siglock. + * cpu_cgroup_attach() must not be possible after cgroup_task_exit() + * and it can't be removed from thread list, we hold ->siglock. * * If an exiting thread was already removed from thread list we rely on * sched_autogroup_exit_task(). diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 198d2dd45f59..b7801cd05d5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); #ifdef CONFIG_SCHED_PROXY_EXEC DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); @@ -583,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state); * * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: * - * is set by activate_task() and cleared by deactivate_task(), under - * rq->lock. Non-zero indicates the task is runnable, the special + * is set by activate_task() and cleared by deactivate_task()/block_task(), + * under rq->lock. Non-zero indicates the task is runnable, the special * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * @@ -2089,6 +2090,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); + rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2121,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); + rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2128,8 +2131,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATED; - if (flags & ENQUEUE_MIGRATED) - sched_mm_cid_migrate_to(rq, p); enqueue_task(rq, p, flags); @@ -2169,37 +2170,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* - * ->switching_to() is called with the pi_lock and rq_lock held and must not - * mess with locking. - */ -void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class) -{ - if (prev_class != p->sched_class && p->sched_class->switching_to) - p->sched_class->switching_to(rq, p); -} - -/* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. - * - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ -void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p); - - p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio || dl_task(p)) - p->sched_class->prio_changed(rq, p, oldprio); -} - void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; @@ -2362,7 +2332,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { @@ -2377,10 +2347,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) if (p->cpus_ptr != &p->cpus_mask) return; - /* - * Violates locking rules! See comment in __do_set_cpus_allowed(). - */ - __do_set_cpus_allowed(p, &ac); + scoped_guard (task_rq_lock, p) + do_set_cpus_allowed(p, &ac); } void ___migrate_enable(void) @@ -2613,7 +2581,8 @@ static int migration_cpu_stop(void *data) */ WARN_ON_ONCE(!pending->stop_pending); preempt_disable(); - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); preempt_enable(); @@ -2622,7 +2591,8 @@ static int migration_cpu_stop(void *data) out: if (pending) pending->stop_pending = false; - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); if (complete) complete_all(&pending->done); @@ -2671,6 +2641,8 @@ out_unlock: return 0; } +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask); + /* * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. @@ -2684,6 +2656,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx cpumask_copy(&p->cpus_mask, ctx->new_mask); p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); + mm_update_cpus_allowed(p->mm, ctx->new_mask); /* * Swap in a new user_cpus_ptr if SCA_USER flag set @@ -2693,56 +2666,17 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { - struct rq *rq = task_rq(p); - bool queued, running; - - /* - * This here violates the locking rules for affinity, since we're only - * supposed to change these variables while holding both rq->lock and - * p->pi_lock. - * - * HOWEVER, it magically works, because ttwu() is the only code that - * accesses these variables under p->pi_lock and only does so after - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() - * before finish_task(). - * - * XXX do further audits, this smells like something putrid. - */ - if (ctx->flags & SCA_MIGRATE_DISABLE) - WARN_ON_ONCE(!p->on_cpu); - else - lockdep_assert_held(&p->pi_lock); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->sched_class->set_cpus_allowed(p, ctx); } /* * Used for kthread_bind() and select_fallback_rq(), in both cases the user * affinity (if any) should be destroyed too. */ -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) { struct affinity_context ac = { .new_mask = new_mask, @@ -2754,7 +2688,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) struct rcu_head rcu; }; - __do_set_cpus_allowed(p, &ac); + scoped_guard (__task_rq_lock, p) + do_set_cpus_allowed(p, &ac); /* * Because this is called with p->pi_lock held, it is not possible @@ -2792,7 +2727,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, * Use pi_lock to protect content of user_cpus_ptr * * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent - * do_set_cpus_allowed(). + * set_cpus_allowed_force(). */ raw_spin_lock_irqsave(&src->pi_lock, flags); if (src->user_cpus_ptr) { @@ -3064,8 +2999,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, unsigned int dest_cpu; int ret = 0; - update_rq_clock(rq); - if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, @@ -3120,7 +3053,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, goto out; } - __do_set_cpus_allowed(p, ctx); + do_set_cpus_allowed(p, ctx); return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); @@ -3329,8 +3262,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - rseq_migrate(p); - sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -3529,13 +3460,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: - /* - * XXX When called from select_task_rq() we only - * hold p->pi_lock and again violate locking order. - * - * More yuck to audit. - */ - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -3777,7 +3702,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) ttwu_do_wakeup(p); ret = 1; } - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); return ret; } @@ -4231,7 +4156,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * __schedule(). See the comment for smp_mb__after_spinlock(). * * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer + * schedule()'s block_task() has 'happened' and p will no longer * care about it's own p->state. See the comment in __schedule(). */ smp_acquire__after_ctrl_dep(); @@ -4370,7 +4295,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) ret = func(p, arg); if (rq) - rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; @@ -4487,7 +4412,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; - init_sched_mm_cid(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -4763,7 +4687,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_task_group = tg; } #endif - rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -4827,7 +4750,6 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); - rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -5121,7 +5043,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); - rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5222,6 +5143,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); + /* + * sched_ext_dead() must come before cgroup_task_dead() to + * prevent cgroups from being removed while its member tasks are + * visible to SCX schedulers. + */ + sched_ext_dead(prev); + cgroup_task_dead(prev); + /* Task is done with its stack. */ put_task_stack(prev); @@ -5284,19 +5213,16 @@ context_switch(struct rq *rq, struct task_struct *prev, * * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch - * - * switch_mm_cid() needs to be updated if the barriers provided - * by context_switch() are modified. */ - if (!next->mm) { // to kernel + if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; - if (prev->mm) // from user + if (prev->mm) // from user mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; - } else { // to user + } else { // to user membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting @@ -5309,15 +5235,20 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm, next); lru_gen_use_mm(next->mm); - if (!prev->mm) { // from kernel + if (!prev->mm) { // from kernel /* will mmdrop_lazy_tlb() in finish_task_switch(). */ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; } } - /* switch_mm_cid() requires the memory barriers above. */ - switch_mm_cid(rq, prev, next); + mm_cid_switch_to(prev, next); + + /* + * Tell rseq that the task was scheduled in. Must be after + * switch_mm_cid() to get the TIF flag set. + */ + rseq_sched_switch_event(next); prepare_lock_switch(rq, next, rf); @@ -5602,7 +5533,6 @@ void sched_tick(void) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -5692,7 +5622,7 @@ static void sched_tick_remote(struct work_struct *work) * reasonable amount of time. */ u64 delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30); } curr->sched_class->task_tick(rq, curr, 0); @@ -5916,19 +5846,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; -#ifdef CONFIG_SCHED_CLASS_EXT - /* - * SCX requires a balance() call before every pick_task() including when - * waking up from SCHED_IDLE. If @start_class is below SCX, start from - * SCX instead. Also, set a flag to detect missing balance() call. - */ - if (scx_enabled()) { - rq->scx.flags |= SCX_RQ_BAL_PENDING; - if (sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; - } -#endif - /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -5972,7 +5889,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - p = pick_task_idle(rq); + p = pick_task_idle(rq, rf); put_prev_set_next_task(rq, prev, p); } @@ -5984,11 +5901,15 @@ restart: for_each_active_class(class) { if (class->pick_next_task) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) return p; } else { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) { put_prev_set_next_task(rq, prev, p); return p; @@ -6018,7 +5939,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; } -static inline struct task_struct *pick_task(struct rq *rq) +/* + * Careful; this can return RETRY_TASK, it does not include the retry-loop + * itself due to the whole SMT pick retry thing below. + */ +static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -6026,7 +5951,7 @@ static inline struct task_struct *pick_task(struct rq *rq) rq->dl_server = NULL; for_each_active_class(class) { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); if (p) return p; } @@ -6041,7 +5966,7 @@ static void queue_core_balance(struct rq *rq); static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *p, *max = NULL; + struct task_struct *next, *p, *max; const struct cpumask *smt_mask; bool fi_before = false; bool core_clock_updated = (rq == rq->core); @@ -6126,7 +6051,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. */ if (!need_sync) { - next = pick_task(rq); +restart_single: + next = pick_task(rq, rf); + if (unlikely(next == RETRY_TASK)) + goto restart_single; if (!next->core_cookie) { rq->core_pick = NULL; rq->core_dl_server = NULL; @@ -6146,6 +6074,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * Tie-break prio towards the current CPU */ +restart_multi: + max = NULL; for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); @@ -6157,7 +6087,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - rq_i->core_pick = p = pick_task(rq_i); + p = pick_task(rq_i, rf); + if (unlikely(p == RETRY_TASK)) + goto restart_multi; + + rq_i->core_pick = p; rq_i->core_dl_server = rq_i->dl_server; if (!max || prio_less(max, p, fi_before)) @@ -6179,7 +6113,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (cookie) p = sched_core_find(rq_i, cookie); if (!p) - p = idle_sched_class.pick_task(rq_i); + p = idle_sched_class.pick_task(rq_i, rf); } rq_i->core_pick = p; @@ -6812,6 +6746,7 @@ static void __sched notrace __schedule(int sched_mode) local_irq_disable(); rcu_note_context_switch(preempt); + migrate_disable_switch(rq, prev); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6918,7 +6853,6 @@ keep_resched: */ ++*switch_count; - migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev) || prev->se.sched_delayed); @@ -7326,7 +7260,7 @@ void rt_mutex_post_schedule(void) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class, *next_class; struct rq_flags rf; @@ -7388,64 +7322,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) prev_class = p->sched_class; next_class = __setscheduler_class(p->policy, prio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + if (prev_class != next_class) + queue_flag |= DEQUEUE_CLASS; - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); - - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + scoped_guard (sched_change, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + scope->flags |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - p->sched_class = next_class; - p->prio = prio; - - check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); + p->sched_class = next_class; + p->prio = prio; + } out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_rq_unlock(rq); + rq_repin_lock(rq, &rf); + __task_rq_unlock(rq, p, &rf); preempt_enable(); } @@ -8084,26 +8005,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - task_rq_unlock(rq, p, &rf); + guard(task_rq_lock)(p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->numa_preferred_nid = nid; } #endif /* CONFIG_NUMA_BALANCING */ @@ -8141,18 +8045,15 @@ static int __balance_push_cpu_stop(void *arg) struct rq_flags rf; int cpu; - raw_spin_lock_irq(&p->pi_lock); - rq_lock(rq, &rf); - - update_rq_clock(rq); - - if (task_rq(p) == rq && task_on_rq_queued(p)) { + scoped_guard (raw_spinlock_irq, &p->pi_lock) { cpu = select_fallback_rq(rq->cpu, p); - rq = __migrate_task(rq, &rf, p, cpu); - } - rq_unlock(rq, &rf); - raw_spin_unlock_irq(&p->pi_lock); + rq_lock(rq, &rf); + update_rq_clock(rq); + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, cpu); + rq_unlock(rq, &rf); + } put_task_struct(p); @@ -8571,10 +8472,12 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { WARN(true, "Dying CPU not properly vacated!"); dump_rq_tasks(rq, KERN_WARNING); } + dl_server_stop(&rq->fair_server); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); @@ -8589,6 +8492,8 @@ void __init sched_init_smp(void) { sched_init_numa(NUMA_NO_NODE); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot @@ -9205,38 +9110,23 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { - int queued, running, queue_flags = - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + bool resched = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - update_rq_clock(rq); - - running = task_current_donor(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); - if (!for_autogroup) - scx_cgroup_move_task(tsk); + scoped_guard (sched_change, tsk, queue_flags) { + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); + if (scope->running) + resched = true; + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. - */ + if (resched) resched_curr(rq); - } } static struct cgroup_subsys_state * @@ -9604,7 +9494,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; + cfs_rq->runtime_remaining = 1; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); @@ -10372,557 +10262,571 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) } #ifdef CONFIG_SCHED_MM_CID - /* - * @cid_lock: Guarantee forward-progress of cid allocation. - * - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock - * is only used when contention is detected by the lock-free allocation so - * forward progress can be guaranteed. - */ -DEFINE_RAW_SPINLOCK(cid_lock); - -/* - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. - * - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is - * detected, it is set to 1 to ensure that all newly coming allocations are - * serialized by @cid_lock until the allocation which detected contention - * completes and sets @use_cid_lock back to 0. This guarantees forward progress - * of a cid allocation. - */ -int use_cid_lock; - -/* - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid - * concurrently with respect to the execution of the source runqueue context - * switch. - * - * There is one basic properties we want to guarantee here: + * Concurrency IDentifier management * - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively - * used by a task. That would lead to concurrent allocation of the cid and - * userspace corruption. - * - * Provide this guarantee by introducing a Dekker memory ordering to guarantee - * that a pair of loads observe at least one of a pair of stores, which can be - * shown as: + * Serialization rules: * - * X = Y = 0 + * mm::mm_cid::mutex: Serializes fork() and exit() and therefore + * protects mm::mm_cid::users. * - * w[X]=1 w[Y]=1 - * MB MB - * r[Y]=y r[X]=x + * mm::mm_cid::lock: Serializes mm_update_max_cids() and + * mm_update_cpus_allowed(). Nests in mm_cid::mutex + * and runqueue lock. * - * Which guarantees that x==0 && y==0 is impossible. But rather than using - * values 0 and 1, this algorithm cares about specific state transitions of the - * runqueue current task (as updated by the scheduler context switch), and the - * per-mm/cpu cid value. + * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks + * and can only be modified with atomic operations. * - * Let's introduce task (Y) which has task->mm == mm and task (N) which has - * task->mm != mm for the rest of the discussion. There are two scheduler state - * transitions on context switch we care about: + * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue + * lock. * - * (TSA) Store to rq->curr with transition from (N) to (Y) + * CID ownership: * - * (TSB) Store to rq->curr with transition from (Y) to (N) + * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or + * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the + * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, + * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the + * task needs to drop the CID into the pool when scheduling out. Both bits + * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is + * actually handed over to user space in the RSEQ memory. * - * On the remote-clear side, there is one transition we care about: + * Mode switching: * - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag + * Switching to per CPU mode happens when the user count becomes greater + * than the maximum number of CIDs, which is calculated by: * - * There is also a transition to UNSET state which can be performed from all - * sides (scheduler, remote-clear). It is always performed with a cmpxchg which - * guarantees that only a single thread will succeed: + * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); + * max_cids = min(1.25 * opt_cids, num_possible_cpus()); * - * (TMB) cmpxchg to *pcpu_cid to mark UNSET + * The +25% allowance is useful for tight CPU masks in scenarios where only + * a few threads are created and destroyed to avoid frequent mode + * switches. Though this allowance shrinks, the closer opt_cids becomes to + * num_possible_cpus(), which is the (unfortunate) hard ABI limit. * - * Just to be clear, what we do _not_ want to happen is a transition to UNSET - * when a thread is actively using the cid (property (1)). + * At the point of switching to per CPU mode the new user is not yet + * visible in the system, so the task which initiated the fork() runs the + * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and + * either transfers each tasks owned CID to the CPU the task runs on or + * drops it into the CID pool if a task is not on a CPU at that point in + * time. Tasks which schedule in before the task walk reaches them do the + * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes + * it's guaranteed that no task related to that MM owns a CID anymore. * - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. + * Switching back to task mode happens when the user count goes below the + * threshold which was recorded on the per CPU mode switch: * - * Scenario A) (TSA)+(TMA) (from next task perspective) + * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2); * - * CPU0 CPU1 + * This threshold is updated when a affinity change increases the number of + * allowed CPUs for the MM, which might cause a switch back to per task + * mode. * - * Context switch CS-1 Remote-clear - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) - * (implied barrier after cmpxchg) - * - switch_mm_cid() - * - memory barrier (see switch_mm_cid() - * comment explaining how this barrier - * is combined with other scheduler - * barriers) - * - mm_cid_get (next) - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) + * If the switch back was initiated by a exiting task, then that task runs + * the fixup function. If it was initiated by a affinity change, then it's + * run either in the deferred update function in context of a workqueue or + * by a task which forks a new one or by a task which exits. Whatever + * happens first. mm_cid_fixup_cpus_to_task() walks through the possible + * CPUs and either transfers the CPU owned CIDs to a related task which + * runs on the CPU or drops it into the pool. Tasks which schedule in on a + * CPU which the walk did not cover yet do the handover themself. * - * This Dekker ensures that either task (Y) is observed by the - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are - * observed. + * This transition from CPU to per task ownership happens in two phases: * - * If task (Y) store is observed by rcu_dereference(), it means that there is - * still an active task on the cpu. Remote-clear will therefore not transition - * to UNSET, which fulfills property (1). + * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task + * CID and denotes that the CID is only temporarily owned by the + * task. When it schedules out the task drops the CID back into the + * pool if this bit is set. * - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), - * it will move its state to UNSET, which clears the percpu cid perhaps - * uselessly (which is not an issue for correctness). Because task (Y) is not - * observed, CPU1 can move ahead to set the state to UNSET. Because moving - * state to UNSET is done with a cmpxchg expecting that the old state has the - * LAZY flag set, only one thread will successfully UNSET. + * 2) The initiating context walks the per CPU space and after completion + * clears mm:mm_cid.transit. So after that point the CIDs are strictly + * task owned again. * - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and - * CPU1 will observe task (Y) and do nothing more, which is fine. + * This two phase transition is required to prevent CID space exhaustion + * during the transition as a direct transfer of ownership would fail if + * two tasks are scheduled in on the same CPU before the fixup freed per + * CPU CIDs. * - * What we are effectively preventing with this Dekker is a scenario where - * neither LAZY flag nor store (Y) are observed, which would fail property (1) - * because this would UNSET a cid which is actively used. + * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID + * related to that MM is owned by a CPU anymore. */ -void sched_mm_cid_migrate_from(struct task_struct *t) -{ - t->migrate_from_cpu = task_cpu(t); -} - -static -int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid) +/* + * Update the CID range properties when the constraints change. Invoked via + * fork(), exit() and affinity changes + */ +static void __mm_update_max_cids(struct mm_mm_cid *mc) { - struct mm_struct *mm = t->mm; - struct task_struct *src_task; - int src_cid, last_mm_cid; + unsigned int opt_cids, max_cids; - if (!mm) - return -1; + /* Calculate the new optimal constraint */ + opt_cids = min(mc->nr_cpus_allowed, mc->users); - last_mm_cid = t->last_mm_cid; - /* - * If the migrated task has no last cid, or if the current - * task on src rq uses the cid, it means the source cid does not need - * to be moved to the destination cpu. - */ - if (last_mm_cid == -1) - return -1; - src_cid = READ_ONCE(src_pcpu_cid->cid); - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) - return -1; + /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */ + max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus()); + WRITE_ONCE(mc->max_cids, max_cids); +} - /* - * If we observe an active task using the mm on this rq, it means we - * are not the last task to be migrated from this cpu for this mm, so - * there is no need to move src_cid to the destination cpu. - */ - guard(rcu)(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - t->last_mm_cid = -1; - return -1; - } +static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) +{ + unsigned int opt_cids; - return src_cid; + opt_cids = min(mc->nr_cpus_allowed, mc->users); + /* Has to be at least 1 because 0 indicates PCPU mode off */ + return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1); } -static -int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid, - int src_cid) +static bool mm_update_max_cids(struct mm_struct *mm) { - struct task_struct *src_task; - struct mm_struct *mm = t->mm; - int lazy_cid; - - if (src_cid == -1) - return -1; + struct mm_mm_cid *mc = &mm->mm_cid; - /* - * Attempt to clear the source cpu cid to move it to the destination - * cpu. - */ - lazy_cid = mm_cid_set_lazy_put(src_cid); - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) - return -1; + lockdep_assert_held(&mm->mm_cid.lock); - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ + /* Clear deferred mode switch flag. A change is handled by the caller */ + mc->update_deferred = false; + __mm_update_max_cids(mc); - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, this task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; - } + /* Check whether owner mode must be changed */ + if (!mc->percpu) { + /* Enable per CPU mode when the number of users is above max_cids */ + if (mc->users > mc->max_cids) + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + } else { + /* Switch back to per task if user count under threshold */ + if (mc->users < mc->pcpu_thrs) + mc->pcpu_thrs = 0; } - /* - * The src_cid is unused, so it can be unset. - */ - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - return -1; - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); - return src_cid; + /* Mode change required? */ + if (!!mc->percpu == !!mc->pcpu_thrs) + return false; + /* When switching back to per TASK mode, set the transition flag */ + if (!mc->pcpu_thrs) + WRITE_ONCE(mc->transit, MM_CID_TRANSIT); + WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); + return true; } -/* - * Migration to dst cpu. Called with dst_rq lock held. - * Interrupts are disabled, which keeps the window of cid ownership without the - * source rq lock held small. - */ -void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; - struct mm_struct *mm = t->mm; - int src_cid, src_cpu; - bool dst_cid_is_set; - struct rq *src_rq; - - lockdep_assert_rq_held(dst_rq); + struct cpumask *mm_allowed; + struct mm_mm_cid *mc; + unsigned int weight; - if (!mm) - return; - src_cpu = t->migrate_from_cpu; - if (src_cpu == -1) { - t->last_mm_cid = -1; + if (!mm || !READ_ONCE(mm->mm_cid.users)) return; - } /* - * Move the src cid if the dst cid is unset. This keeps id - * allocation closest to 0 in cases where few threads migrate around - * many CPUs. - * - * If destination cid or recent cid is already set, we may have - * to just clear the src cid to ensure compactness in frequent - * migrations scenarios. - * - * It is not useful to clear the src cid when the number of threads is - * greater or equal to the number of allowed CPUs, because user-space - * can expect that the number of allowed cids can reach the number of - * allowed CPUs. - */ - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) + * mm::mm_cid::mm_cpus_allowed is the superset of each threads + * allowed CPUs mask which means it can only grow. + */ + mc = &mm->mm_cid; + guard(raw_spinlock)(&mc->lock); + mm_allowed = mm_cpus_allowed(mm); + weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); + if (weight == mc->nr_cpus_allowed) return; - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); - src_rq = cpu_rq(src_cpu); - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); - if (src_cid == -1) + + WRITE_ONCE(mc->nr_cpus_allowed, weight); + __mm_update_max_cids(mc); + if (!mc->percpu) return; - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, - src_cid); - if (src_cid == -1) + + /* Adjust the threshold to the wider set */ + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + /* Switch back to per task mode? */ + if (mc->users >= mc->pcpu_thrs) return; - if (dst_cid_is_set) { - __mm_cid_put(mm, src_cid); + + /* Don't queue twice */ + if (mc->update_deferred) return; - } - /* Move src_cid to dst cpu. */ - mm_cid_snapshot_time(dst_rq, mm); - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); + + /* Queue the irq work, which schedules the real work */ + mc->update_deferred = true; + irq_work_queue(&mc->irq_work); } -static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, - int cpu) +static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) { - struct rq *rq = cpu_rq(cpu); - struct task_struct *t; - int cid, lazy_cid; + if (cid_on_cpu(t->mm_cid.cid)) { + unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid)) - return; + t->mm_cid.cid = cid_to_transit_cid(cid); + pcp->cid = t->mm_cid.cid; + } +} - /* - * Clear the cpu cid if it is set to keep cid allocation compact. If - * there happens to be other tasks left on the source cpu using this - * mm, the next task using this mm will reallocate its cid on context - * switch. - */ - lazy_cid = mm_cid_set_lazy_put(cid); - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) - return; +static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) +{ + unsigned int cpu; - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ + /* Walk the CPUs and fixup all stale CIDs */ + for_each_possible_cpu(cpu) { + struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu); + struct rq *rq = cpu_rq(cpu); - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, that task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) - return; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(rq_lock_irq)(rq); + /* Is the CID still owned by the CPU? */ + if (cid_on_cpu(pcp->cid)) { + /* + * If rq->curr has @mm, transfer it with the + * transition bit set. Otherwise drop it. + */ + if (rq->curr->mm == mm && rq->curr->mm_cid.active) + mm_cid_transit_to_task(rq->curr, pcp); + else + mm_drop_cid_on_cpu(mm, pcp); + + } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) { + unsigned int cid = rq->curr->mm_cid.cid; + + /* Ensure it has the transition bit set */ + if (!cid_in_transit(cid)) { + cid = cid_to_transit_cid(cid); + rq->curr->mm_cid.cid = cid; + pcp->cid = cid; + } + } } + /* Clear the transition bit */ + WRITE_ONCE(mm->mm_cid.transit, 0); +} - /* - * The cid is unused, so it can be unset. - * Disable interrupts to keep the window of cid ownership without rq - * lock small. - */ - scoped_guard (irqsave) { - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); +static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) +{ + if (cid_on_task(t->mm_cid.cid)) { + t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid); + pcp->cid = t->mm_cid.cid; } } -static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) +static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) { - struct rq *rq = cpu_rq(cpu); - struct mm_cid *pcpu_cid; - struct task_struct *curr; - u64 rq_clock; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(task_rq_lock)(t); + /* If the task is not active it is not in the users count */ + if (!t->mm_cid.active) + return false; + if (cid_on_task(t->mm_cid.cid)) { + /* If running on the CPU, transfer the CID, otherwise drop it */ + if (task_rq(t)->curr == t) + mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); + else + mm_unset_cid_on_task(t); + } + return true; +} - /* - * rq->clock load is racy on 32-bit but one spurious clear once in a - * while is irrelevant. - */ - rq_clock = READ_ONCE(rq->clock); - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); +static void mm_cid_fixup_tasks_to_cpus(void) +{ + struct mm_struct *mm = current->mm; + struct task_struct *p, *t; + unsigned int users; /* - * In order to take care of infrequently scheduled tasks, bump the time - * snapshot associated with this cid if an active task using the mm is - * observed on this rq. + * This can obviously race with a concurrent affinity change, which + * increases the number of allowed CPUs for this mm, but that does + * not affect the mode and only changes the CID constraints. A + * possible switch back to per task mode happens either in the + * deferred handler function or in the next fork()/exit(). + * + * The caller has already transferred. The newly incoming task is + * already accounted for, but not yet visible. */ - scoped_guard (rcu) { - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - return; - } + users = mm->mm_cid.users - 2; + if (!users) + return; + + guard(rcu)(); + for_other_threads(current, t) { + if (mm_cid_fixup_task_to_cpu(t, mm)) + users--; } - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) + if (!users) return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + + /* Happens only for VM_CLONE processes. */ + for_each_process_thread(p, t) { + if (t == current || t->mm != mm) + continue; + if (mm_cid_fixup_task_to_cpu(t, mm)) { + if (--users == 0) + return; + } + } } -static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, - int weight) +static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) { - struct mm_cid *pcpu_cid; - int cid; - - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid) || cid < weight) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + t->mm_cid.active = 1; + mm->mm_cid.users++; + return mm_update_max_cids(mm); } -static void task_mm_cid_work(struct callback_head *work) +void sched_mm_cid_fork(struct task_struct *t) { - unsigned long now = jiffies, old_scan, next_scan; - struct task_struct *t = current; - struct cpumask *cidmask; - struct mm_struct *mm; - int weight, cpu; + struct mm_struct *mm = t->mm; + bool percpu; - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); - work->next = work; /* Prevent double-add */ - if (t->flags & PF_EXITING) - return; - mm = t->mm; - if (!mm) - return; - old_scan = READ_ONCE(mm->mm_cid_next_scan); - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); - if (!old_scan) { - unsigned long res; - - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); - if (res != old_scan) - old_scan = res; + guard(mutex)(&mm->mm_cid.mutex); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); + + /* First user ? */ + if (!mm->mm_cid.users) { + sched_mm_cid_add_user(t, mm); + t->mm_cid.cid = mm_get_cid(mm); + /* Required for execve() */ + pcp->cid = t->mm_cid.cid; + return; + } + + if (!sched_mm_cid_add_user(t, mm)) { + if (!mm->mm_cid.percpu) + t->mm_cid.cid = mm_get_cid(mm); + return; + } + + /* Handle the mode change and transfer current's CID */ + percpu = !!mm->mm_cid.percpu; + if (!percpu) + mm_cid_transit_to_task(current, pcp); else - old_scan = next_scan; + mm_cid_transfer_to_cpu(current, pcp); } - if (time_before(now, old_scan)) - return; - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) - return; - cidmask = mm_cidmask(mm); - /* Clear cids that were not recently used. */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_old(mm, cpu); - weight = cpumask_weight(cidmask); - /* - * Clear cids that are greater or equal to the cidmask weight to - * recompact it. - */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_weight(mm, cpu, weight); -} -void init_sched_mm_cid(struct task_struct *t) -{ - struct mm_struct *mm = t->mm; - int mm_users = 0; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); + if (percpu) { + mm_cid_fixup_tasks_to_cpus(); + } else { + mm_cid_fixup_cpus_to_tasks(mm); + t->mm_cid.cid = mm_get_cid(mm); } - t->cid_work.next = &t->cid_work; /* Protect against double add */ - init_task_work(&t->cid_work, task_mm_cid_work); } -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) +static bool sched_mm_cid_remove_user(struct task_struct *t) { - struct callback_head *work = &curr->cid_work; - unsigned long now = jiffies; - - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || - work->next != work) - return; - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) - return; - - /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME); + t->mm_cid.active = 0; + scoped_guard(preempt) { + /* Clear the transition bit */ + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); + mm_unset_cid_on_task(t); + } + t->mm->mm_cid.users--; + return mm_update_max_cids(t->mm); } -void sched_mm_cid_exit_signals(struct task_struct *t) +static bool __sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - - if (!mm) - return; - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); + if (!sched_mm_cid_remove_user(t)) + return false; /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * Contrary to fork() this only deals with a switch back to per + * task mode either because the above decreased users or an + * affinity change increased the number of allowed CPUs and the + * deferred fixup did not run yet. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return false; + /* + * A failed fork(2) cleanup never gets here, so @current must have + * the same MM as @t. That's true for exit() and the failed + * pthread_create() cleanup case. + */ + if (WARN_ON_ONCE(current->mm != mm)) + return false; + return true; } -void sched_mm_cid_before_execve(struct task_struct *t) +/* + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. + */ +void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) + if (!mm || !t->mm_cid.active) return; + /* + * Ensure that only one instance is doing MM CID operations within + * a MM. The common case is uncontended. The rare fixup case adds + * some overhead. + */ + scoped_guard(mutex, &mm->mm_cid.mutex) { + /* mm_cid::mutex is sufficient to protect mm_cid::users */ + if (likely(mm->mm_cid.users > 1)) { + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + if (!__sched_mm_cid_exit(t)) + return; + /* Mode change required. Transfer currents CID */ + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); + } + mm_cid_fixup_cpus_to_tasks(mm); + return; + } + /* Last user */ + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Required across execve() */ + if (t == current) + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); + /* Ignore mode change. There is nothing to do. */ + sched_mm_cid_remove_user(t); + } + } - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * As this is the last user (execve(), process exit or failed + * fork(2)) there is no concurrency anymore. + * + * Synchronize eventually pending work to ensure that there are no + * dangling references left. @t->mm_cid.users is zero so nothing + * can queue this work anymore. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + irq_work_sync(&mm->mm_cid.irq_work); + cancel_work_sync(&mm->mm_cid.work); +} + +/* Deactivate MM CID allocation across execve() */ +void sched_mm_cid_before_execve(struct task_struct *t) +{ + sched_mm_cid_exit(t); } +/* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { - struct mm_struct *mm = t->mm; - struct rq *rq; + sched_mm_cid_fork(t); +} + +static void mm_cid_work_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); - if (!mm) + guard(mutex)(&mm->mm_cid.mutex); + /* Did the last user task exit already? */ + if (!mm->mm_cid.users) return; - preempt_disable(); - rq = this_rq(); - scoped_guard (rq_lock_irqsave, rq) { - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Have fork() or exit() handled it already? */ + if (!mm->mm_cid.update_deferred) + return; + /* This clears mm_cid::update_deferred */ + if (!mm_update_max_cids(mm)) + return; + /* Affinity changes can only switch back to task mode */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return; } + mm_cid_fixup_cpus_to_tasks(mm); } -void sched_mm_cid_fork(struct task_struct *t) +static void mm_cid_irq_work(struct irq_work *work) { - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); - t->mm_cid_active = 1; + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); + + /* + * Needs to be unconditional because mm_cid::lock cannot be held + * when scheduling work as mm_update_cpus_allowed() nests inside + * rq::lock and schedule_work() might end up in wakeup... + */ + schedule_work(&mm->mm_cid.work); } -#endif /* CONFIG_SCHED_MM_CID */ -#ifdef CONFIG_SCHED_CLASS_EXT -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx) +void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { + mm->mm_cid.max_cids = 0; + mm->mm_cid.percpu = 0; + mm->mm_cid.transit = 0; + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + mm->mm_cid.users = 0; + mm->mm_cid.pcpu_thrs = 0; + mm->mm_cid.update_deferred = 0; + raw_spin_lock_init(&mm->mm_cid.lock); + mutex_init(&mm->mm_cid.mutex); + mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); +} +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } +#endif /* !CONFIG_SCHED_MM_CID */ + +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); + +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) +{ + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); struct rq *rq = task_rq(p); + /* + * Must exclusively use matched flags since this is both dequeue and + * enqueue. + */ + WARN_ON_ONCE(flags & 0xFFFF0000); + lockdep_assert_rq_held(rq); - *ctx = (struct sched_enq_and_set_ctx){ + if (!(flags & DEQUEUE_NOCLOCK)) { + update_rq_clock(rq); + flags |= DEQUEUE_NOCLOCK; + } + + if (flags & DEQUEUE_CLASS) { + if (p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); + } + + *ctx = (struct sched_change_ctx){ .p = p, - .queue_flags = queue_flags, + .flags = flags, .queued = task_on_rq_queued(p), - .running = task_current(rq, p), + .running = task_current_donor(rq, p), }; - update_rq_clock(rq); + if (!(flags & DEQUEUE_CLASS)) { + if (p->sched_class->get_prio) + ctx->prio = p->sched_class->get_prio(rq, p); + else + ctx->prio = p->prio; + } + if (ctx->queued) - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + dequeue_task(rq, p, flags); if (ctx->running) put_prev_task(rq, p); + + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) + p->sched_class->switched_from(rq, p); + + return ctx; } -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +void sched_change_end(struct sched_change_ctx *ctx) { - struct rq *rq = task_rq(ctx->p); + struct task_struct *p = ctx->p; + struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); + if (ctx->queued) - enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + enqueue_task(rq, p, ctx->flags); if (ctx->running) - set_next_task(rq, ctx->p); + set_next_task(rq, p); + + if (ctx->flags & ENQUEUE_CLASS) { + if (p->sched_class->switched_to) + p->sched_class->switched_to(rq, p); + } else { + p->sched_class->prio_changed(rq, p, ctx->prio); + } } -#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index cdd740b3f774..37b572cc8aca 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target CPU + * @online: the online state of the deadline runqueue * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_clear(struct cpudl *cp, int cpu) +void cpudl_clear(struct cpudl *cp, int cpu, bool online) { int old_idx, new_cpu; unsigned long flags; @@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu) if (old_idx == IDX_INVALID) { /* * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is + * This could happen if rq_online_dl or rq_offline_dl is * called for a CPU without -dl tasks running. */ } else { @@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu) cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; cpudl_heapify(cp, old_idx); - - cpumask_set_cpu(cpu, cp->free_cpus); } + if (likely(online)) + __cpumask_set_cpu(cpu, cp->free_cpus); + else + __cpumask_clear_cpu(cpu, cp->free_cpus); + raw_spin_unlock_irqrestore(&cp->lock, flags); } @@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; cpudl_heapify_up(cp, new_idx); - cpumask_clear_cpu(cpu, cp->free_cpus); + __cpumask_clear_cpu(cpu, cp->free_cpus); } else { cp->elements[old_idx].dl = dl; cpudl_heapify(cp, old_idx); @@ -238,26 +242,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) } /* - * cpudl_set_freecpu - Set the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_set_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_set_cpu(cpu, cp->free_cpus); -} - -/* - * cpudl_clear_freecpu - Clear the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_clear_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_clear_cpu(cpu, cp->free_cpus); -} - -/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 11c0f1faa7e1..d7699468eedd 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -19,8 +19,6 @@ struct cpudl { int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); -void cpudl_clear(struct cpudl *cp, int cpu); +void cpudl_clear(struct cpudl *cp, int cpu, bool online); int cpudl_init(struct cpudl *cp); -void cpudl_set_freecpu(struct cpudl *cp, int cpu); -void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7097de2c8cda..4f97896887ec 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -313,10 +313,8 @@ static u64 read_sum_exec_runtime(struct task_struct *t) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; - u64 utime, stime; struct task_struct *t; - unsigned int seq, nextseq; - unsigned long flags; + u64 utime, stime; /* * Update current task runtime to account pending time since last @@ -329,27 +327,19 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) if (same_thread_group(current, tsk)) (void) task_sched_runtime(current); - rcu_read_lock(); - /* Attempt a lockless read on the first round. */ - nextseq = 0; - do { - seq = nextseq; - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { times->utime = sig->utime; times->stime = sig->stime; times->sum_exec_runtime = sig->sum_sched_runtime; - for_each_thread(tsk, t) { + __for_each_thread(sig, t) { task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += read_sum_exec_runtime(t); } - /* If lockless access failed, take the lock. */ - nextseq = 1; - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); + } } #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 615411a0a881..319439fe1870 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -125,20 +125,11 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); - if (cpumask_subset(rd->span, cpu_active_mask)) - return cpumask_weight(rd->span); - - cpus = 0; - - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; - - return cpus; + return cpumask_weight_and(rd->span, cpu_active_mask); } static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) @@ -405,7 +396,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct sched_dl_entity *dl_se) +static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task) { struct hrtimer *timer = &dl_se->inactive_timer; struct rq *rq = rq_of_dl_se(dl_se); @@ -444,10 +435,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se) } else { struct task_struct *p = dl_task_of(dl_se); - if (dl_task(p)) + if (dl_task) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (READ_ONCE(p->__state) == TASK_DEAD) @@ -1166,8 +1157,17 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ sched_clock_tick(); update_rq_clock(rq); - if (!dl_se->dl_runtime) + /* + * Make sure current has propagated its pending runtime into + * any relevant server through calling dl_server_update() and + * friends. + */ + rq->donor->sched_class->update_curr(rq); + + if (dl_se->dl_defer_idle) { + dl_server_stop(dl_se); return HRTIMER_NORESTART; + } if (dl_se->dl_defer_armed) { /* @@ -1416,10 +1416,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta } static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags); + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { + bool idle = rq->curr == rq->idle; s64 scaled_delta_exec; if (unlikely(delta_exec <= 0)) { @@ -1440,6 +1441,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 dl_se->runtime -= scaled_delta_exec; + if (dl_se->dl_defer_idle && !idle) + dl_se->dl_defer_idle = 0; + /* * The fair server can consume its runtime while throttled (not queued/ * running as regular CFS). @@ -1450,6 +1454,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 */ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { /* + * Non-servers would never get time accounted while throttled. + */ + WARN_ON_ONCE(!dl_server(dl_se)); + + /* + * While the server is marked idle, do not push out the + * activation further, instead wait for the period timer + * to lapse and stop the server. + */ + if (dl_se->dl_defer_idle && idle) { + /* + * The timer is at the zero-laxity point, this means + * dl_server_stop() / dl_server_start() can happen + * while now < deadline. This means update_dl_entity() + * will not replenish. Additionally start_dl_timer() + * will be set for 'deadline - runtime'. Negative + * runtime will not do. + */ + dl_se->runtime = 0; + return; + } + + /* * If the server was previously activated - the starving condition * took place, it this point it went away because the fair scheduler * was able to get runtime in background. So return to the initial @@ -1461,6 +1488,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 replenish_dl_new_period(dl_se, dl_se->rq); + if (idle) + dl_se->dl_defer_idle = 1; + /* * Not being able to start the timer seems problematic. If it could not * be started for whatever reason, we need to "unthrottle" the DL server @@ -1543,38 +1573,213 @@ throttle: * as time available for the fair server, avoiding a penalty for the * rt scheduler that did not consumed that time. */ -void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) { - s64 delta_exec; - - if (!rq->fair_server.dl_defer) - return; - - /* no need to discount more */ - if (rq->fair_server.runtime < 0) - return; - - delta_exec = rq_clock_task(rq) - p->se.exec_start; - if (delta_exec < 0) - return; - - rq->fair_server.runtime -= delta_exec; - - if (rq->fair_server.runtime < 0) { - rq->fair_server.dl_defer_running = 0; - rq->fair_server.runtime = 0; - } - - p->se.exec_start = rq_clock_task(rq); + if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_server_active && dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } +/* + * dl_server && dl_defer: + * + * 6 + * +--------------------+ + * v | + * +-------------+ 4 +-----------+ 5 +------------------+ + * +-> | A:init | <--- | D:running | -----> | E:replenish-wait | + * | +-------------+ +-----------+ +------------------+ + * | | | 1 ^ ^ | + * | | 1 +----------+ | 3 | + * | v | | + * | +--------------------------------+ 2 | + * | | | ----+ | + * | 8 | B:zero_laxity-wait | | | + * | | | <---+ | + * | +--------------------------------+ | + * | | ^ ^ 2 | + * | | 7 | 2 +--------------------+ + * | v | + * | +-------------+ | + * +-- | C:idle-wait | -+ + * +-------------+ + * ^ 7 | + * +---------+ + * + * + * [A] - init + * dl_server_active = 0 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 0/1 + * dl_defer_idle = 0 + * + * [B] - zero_laxity-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 0 + * + * [C] - idle-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 1 + * + * [D] - running + * dl_server_active = 1 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * [E] - replenish-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * + * [1] A->B, A->D + * dl_server_start() + * dl_server_active = 1; + * enqueue_dl_entity() + * update_dl_entity(WAKEUP) + * if (!dl_defer_running) + * dl_defer_armed = 1; + * dl_throttled = 1; + * if (dl_throttled && start_dl_timer()) + * return; // [B] + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from client-class + * [2] B->B, C->B, E->B + * dl_server_update() + * update_curr_dl_se() // idle = false + * if (dl_defer_idle) + * dl_defer_idle = 0; + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); // stop timer + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * start_dl_timer(); // restart timer + * // [B] + * + * // timer actually fires means we have runtime + * [3] B->D + * dl_server_timer() + * if (dl_defer_armed) + * dl_defer_running = 1; + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * // fwd period + * if (dl_throttled) + * dl_throttled = 0; + * if (dl_defer_armed) + * dl_defer_armed = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // schedule server + * [4] D->A + * pick_task_dl() + * p = server_pick_task(); + * if (!p) + * dl_server_stop() + * dequeue_dl_entity(); + * hrtimer_try_to_cancel(); + * dl_defer_armed = 0; + * dl_throttled = 0; + * dl_server_active = 0; + * // [A] + * return p; + * + * // server running + * [5] D->E + * update_curr_dl_se() + * if (dl_runtime_exceeded()) + * dl_throttled = 1; + * dequeue_dl_entity(); + * start_dl_timer(); + * // [E] + * + * // server replenished + * [6] E->D + * dl_server_timer() + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * fwd-period + * if (dl_throttled) + * dl_throttled = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from idle + * [7] B->C, C->C + * dl_server_update_idle() + * update_curr_dl_se() // idle = true + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * if (dl_defer_idle) + * return; + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * dl_defer_idle = 1; + * start_dl_timer(); // restart timer + * // [C] + * + * // stop idle server + * [8] C->A + * dl_server_timer() + * if (dl_defer_idle) + * dl_server_stop(); + * // [A] + * + * + * digraph dl_server { + * "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"] + * "A:init" -> "D:running" [label="1:dl_server_start"] + * "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"] + * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"] + * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "D:running" -> "A:init" [label="4:pick_task_dl"] + * "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"] + * "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"] + * } + * + * + * Notes: + * + * - When there are fair tasks running the most likely loop is [2]->[2]. + * the dl_server never actually runs, the timer never fires. + * + * - When there is actual fair starvation; the timer fires and starts the + * dl_server. This will then throttle and replenish like a normal DL + * task. Notably it will not 'defer' again. + * + * - When idle it will push the actication forward once, and then wait + * for the timer to hit or a non-idle update to restart things. + */ void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; @@ -1582,6 +1787,14 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (!dl_server(dl_se) || dl_se->dl_server_active) return; + /* + * Update the current task to 'now'. + */ + rq->donor->sched_class->update_curr(rq); + + if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) + return; + dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) @@ -1597,6 +1810,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_defer_idle = 0; dl_se->dl_server_active = 0; } @@ -1808,7 +2022,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = rb_first_cached(&dl_rq->root); @@ -2045,7 +2259,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - task_non_contending(dl_se); + task_non_contending(dl_se, true); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -2140,7 +2354,7 @@ static void yield_task_dl(struct rq *rq) * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - rq->curr->dl.dl_yielded = 1; + rq->donor->dl.dl_yielded = 1; update_rq_clock(rq); update_curr_dl(rq); @@ -2170,7 +2384,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) struct rq *rq; if (!(flags & WF_TTWU)) - goto out; + return cpu; rq = cpu_rq(cpu); @@ -2208,7 +2422,6 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) } rcu_read_unlock(); -out: return cpu; } @@ -2352,7 +2565,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) * __pick_next_task_dl - Helper to pick the next -deadline task to run. * @rq: The runqueue to pick the next task from. */ -static struct task_struct *__pick_task_dl(struct rq *rq) +static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2366,7 +2579,7 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick_task(dl_se); + p = dl_se->server_pick_task(dl_se, rf); if (!p) { dl_server_stop(dl_se); goto again; @@ -2379,9 +2592,9 @@ again: return p; } -static struct task_struct *pick_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) { - return __pick_task_dl(rq); + return __pick_task_dl(rq, rf); } static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) @@ -2462,6 +2675,7 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; } +/* Access rule: must be called on local CPU with preemption disabled */ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) @@ -2880,9 +3094,10 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); - cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); + else + cpudl_clear(&rq->rd->cpudl, rq->cpu, true); } /* Assumes rq->lock is held */ @@ -2891,8 +3106,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_clear(&rq->rd->cpudl, rq->cpu); - cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, false); } void __init init_sched_dl_class(void) @@ -2904,11 +3118,43 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +/* + * This function always returns a non-empty bitmap in @cpus. This is because + * if a root domain has reserved bandwidth for DL tasks, the DL bandwidth + * check will prevent CPU hotplug from deactivating all CPUs in that domain. + */ +static void dl_get_task_effective_cpus(struct task_struct *p, struct cpumask *cpus) +{ + const struct cpumask *hk_msk; + + hk_msk = housekeeping_cpumask(HK_TYPE_DOMAIN); + if (housekeeping_enabled(HK_TYPE_DOMAIN)) { + if (!cpumask_intersects(p->cpus_ptr, hk_msk)) { + /* + * CPUs isolated by isolcpu="domain" always belong to + * def_root_domain. + */ + cpumask_andnot(cpus, cpu_active_mask, hk_msk); + return; + } + } + + /* + * If a root domain holds a DL task, it must have active CPUs. So + * active CPUs can always be found by walking up the task's cpuset + * hierarchy up to the partition root. + */ + cpuset_cpus_allowed_locked(p, cpus); +} + +/* The caller should hold cpuset_mutex */ void dl_add_task_root_domain(struct task_struct *p) { struct rq_flags rf; struct rq *rq; struct dl_bw *dl_b; + unsigned int cpu; + struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (!dl_task(p) || dl_entity_is_special(&p->dl)) { @@ -2916,16 +3162,25 @@ void dl_add_task_root_domain(struct task_struct *p) return; } - rq = __task_rq_lock(p, &rf); - + /* + * Get an active rq, whose rq->rd traces the correct root + * domain. + * Ideally this would be under cpuset reader lock until rq->rd is + * fetched. However, sleepable locks cannot nest inside pi_lock, so we + * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex' + * to guarantee the CPU stays in the cpuset. + */ + dl_get_task_effective_cpus(p, msk); + cpu = cpumask_first_and(cpu_active_mask, msk); + BUG_ON(cpu >= nr_cpu_ids); + rq = cpu_rq(cpu); dl_b = &rq->rd->dl_bw; - raw_spin_lock(&dl_b->lock); + /* End of fetching rd */ + raw_spin_lock(&dl_b->lock); __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); - raw_spin_unlock(&dl_b->lock); - - task_rq_unlock(rq, p, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); } void dl_clear_root_domain(struct root_domain *rd) @@ -2970,7 +3225,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(&p->dl); + task_non_contending(&p->dl, false); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to @@ -3042,23 +3297,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } } +static u64 get_prio_dl(struct rq *rq, struct task_struct *p) +{ + return p->dl.deadline; +} + /* * If the scheduling parameters of a -deadline task changed, * a push or pull operation might be needed. */ -static void prio_changed_dl(struct rq *rq, struct task_struct *p, - int oldprio) +static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline) { if (!task_on_rq_queued(p)) return; - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) + if (p->dl.deadline == old_deadline) + return; + + if (dl_time_before(old_deadline, p->dl.deadline)) deadline_queue_pull_task(rq); if (task_current_donor(rq, p)) { @@ -3091,6 +3347,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(dl) = { + .queue_mask = 8, + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -3113,6 +3371,7 @@ DEFINE_SCHED_CLASS(dl) = { .task_tick = task_tick_dl, .task_fork = task_fork_dl, + .get_prio = get_prio_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a790..41caa22e0680 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) right_vruntime = last->vruntime; - min_vruntime = cfs_rq->min_vruntime; + zero_vruntime = cfs_rq->zero_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", SPLIT_NS(left_deadline)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", SPLIT_NS(left_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", + SPLIT_NS(zero_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", SPLIT_NS(avg_vruntime(cfs_rq))); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2b0e88206d07..05f5a49e9649 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root; * guarantee system safety. Maintain a dedicated task list which contains every * task between its fork and eventual free. */ -static DEFINE_SPINLOCK(scx_tasks_lock); +static DEFINE_RAW_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ @@ -33,9 +33,10 @@ static DEFINE_MUTEX(scx_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); -static unsigned long scx_in_softlockup; -static atomic_t scx_breather_depth = ATOMIC_INIT(0); static int scx_bypass_depth; +static cpumask_var_t scx_bypass_lb_donee_cpumask; +static cpumask_var_t scx_bypass_lb_resched_cpumask; +static bool scx_aborting; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -67,8 +68,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* for %SCX_KICK_WAIT */ -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; +/* + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated + * lazily when enabling and freed when disabling to avoid waste when sched_ext + * isn't active. + */ +struct scx_kick_syncs { + struct rcu_head rcu; + unsigned long syncs[]; +}; + +static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); /* * Direct dispatch marker. @@ -132,26 +144,70 @@ static struct scx_dump_data scx_dump_data = { /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; +/* + * Parameters that can be adjusted through /sys/module/sched_ext/parameters. + * There usually is no reason to modify these as normal scheduler operation + * shouldn't be affected by them. The knobs are primarily for debugging. + */ +static u64 scx_slice_dfl = SCX_SLICE_DFL; +static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; +static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; + +static int set_slice_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); +} + +static const struct kernel_param_ops slice_us_param_ops = { + .set = set_slice_us, + .get = param_get_uint, +}; + +static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); +} + +static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { + .set = set_bypass_lb_intv_us, + .get = param_get_uint, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "sched_ext." + +module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); +MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); +module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); +MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); + +#undef MODULE_PARAM_PREFIX + #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); +static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); -static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, +static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); -static __printf(4, 5) void scx_exit(struct scx_sched *sch, +static __printf(4, 5) bool scx_exit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, ...) { va_list args; + bool ret; va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); + ret = scx_vexit(sch, kind, exit_code, fmt, args); va_end(args); + + return ret; } #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -189,7 +245,15 @@ static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); +} + +static const struct sched_class *scx_setscheduler_class(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) + return &stop_sched_class; + + return __setscheduler_class(p->policy, p->prio); } /* @@ -458,19 +522,16 @@ struct scx_task_iter { * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be - * visited as long as they still exist. + * visited as long as they are not dead. */ static void scx_task_iter_start(struct scx_task_iter *iter) { - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & - ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + memset(iter, 0, sizeof(*iter)); - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked_task = NULL; - iter->cnt = 0; iter->list_locked = true; } @@ -496,14 +557,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter) __scx_task_iter_rq_unlock(iter); if (iter->list_locked) { iter->list_locked = false; - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); } } static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { if (!iter->list_locked) { - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->list_locked = true; } } @@ -536,14 +597,13 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - __scx_task_iter_maybe_relock(iter); - if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - __scx_task_iter_maybe_relock(iter); } + __scx_task_iter_maybe_relock(iter); + list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) return NULL; @@ -744,6 +804,11 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err static void run_deferred(struct rq *rq) { process_ddsp_deferred_locals(rq); + + if (local_read(&rq->scx.reenq_local_deferred)) { + local_set(&rq->scx.reenq_local_deferred, 0); + reenq_local(rq); + } } static void deferred_bal_cb_workfn(struct rq *rq) @@ -764,12 +829,28 @@ static void deferred_irq_workfn(struct irq_work *irq_work) * schedule_deferred - Schedule execution of deferred actions on an rq * @rq: target rq * - * Schedule execution of deferred actions on @rq. Must be called with @rq - * locked. Deferred actions are executed with @rq locked but unpinned, and thus - * can unlock @rq to e.g. migrate tasks to other rqs. + * Schedule execution of deferred actions on @rq. Deferred actions are executed + * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks + * to other rqs. */ static void schedule_deferred(struct rq *rq) { + /* + * Queue an irq work. They are executed on IRQ re-enable which may take + * a bit longer than the scheduler hook in schedule_deferred_locked(). + */ + irq_work_queue(&rq->scx.deferred_irq_work); +} + +/** + * schedule_deferred_locked - Schedule execution of deferred actions on an rq + * @rq: target rq + * + * Schedule execution of deferred actions on @rq. Equivalent to + * schedule_deferred() but requires @rq to be locked and can be more efficient. + */ +static void schedule_deferred_locked(struct rq *rq) +{ lockdep_assert_rq_held(rq); /* @@ -780,23 +861,32 @@ static void schedule_deferred(struct rq *rq) if (rq->scx.flags & SCX_RQ_IN_WAKEUP) return; + /* Don't do anything if there already is a deferred operation. */ + if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING) + return; + /* * If in balance, the balance callbacks will be called before rq lock is * released. Schedule one. + * + * + * We can't directly insert the callback into the + * rq's list: The call can drop its lock and make the pending balance + * callback visible to unrelated code paths that call rq_pin_lock(). + * + * Just let balance_one() know that it must do it itself. */ if (rq->scx.flags & SCX_RQ_IN_BALANCE) { - queue_balance_callback(rq, &rq->scx.deferred_bal_cb, - deferred_bal_cb_workfn); + rq->scx.flags |= SCX_RQ_BAL_CB_PENDING; return; } /* - * No scheduler hooks available. Queue an irq work. They are executed on - * IRQ re-enable which may take a bit longer than the scheduler hooks. - * The above WAKEUP and BALANCE paths should cover most of the cases and - * the time to IRQ re-enable shouldn't be long. + * No scheduler hooks available. Use the generic irq_work path. The + * above WAKEUP and BALANCE paths should cover most of the cases and the + * time to IRQ re-enable shouldn't be long. */ - irq_work_queue(&rq->scx.deferred_irq_work); + schedule_deferred(rq); } /** @@ -881,7 +971,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = READ_ONCE(scx_slice_dfl); __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } @@ -895,7 +985,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, !RB_EMPTY_NODE(&p->scx.dsq_priq)); if (!is_local) { - raw_spin_lock(&dsq->lock); + raw_spin_lock_nested(&dsq->lock, + (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ @@ -944,8 +1036,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, container_of(rbp, struct task_struct, scx.dsq_priq); list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); + /* first task unchanged - no update needed */ } else { list_add(&p->scx.dsq_list.node, &dsq->list); + /* not builtin and new task is at head - use fastpath */ + rcu_assign_pointer(dsq->first_task, p); } } else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ @@ -953,10 +1048,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", dsq->id); - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { list_add(&p->scx.dsq_list.node, &dsq->list); - else + /* new task inserted at head - use fastpath */ + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } else { + bool was_empty; + + was_empty = list_empty(&dsq->list); list_add_tail(&p->scx.dsq_list.node, &dsq->list); + if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } } /* seq records the order tasks are queued, used by BPF DSQ iterator */ @@ -1013,6 +1117,13 @@ static void task_unlink_from_dsq(struct task_struct *p, list_del_init(&p->scx.dsq_list.node); dsq_mod_nr(dsq, -1); + + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { + struct task_struct *first_task; + + first_task = nldsq_next_task(dsq, NULL, false); + rcu_assign_pointer(dsq->first_task, first_task); + } } static void dispatch_dequeue(struct rq *rq, struct task_struct *p) @@ -1020,6 +1131,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &rq->scx.local_dsq; + lockdep_assert_rq_held(rq); + if (!dsq) { /* * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. @@ -1066,6 +1179,20 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } +/* + * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq + * and dsq are locked. + */ +static void dispatch_dequeue_locked(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_held(&dsq->lock); + + task_unlink_from_dsq(p, dsq); + p->scx.dsq = NULL; +} + static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, struct rq *rq, u64 dsq_id, struct task_struct *p) @@ -1171,7 +1298,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); list_add_tail(&p->scx.dsq_list.node, &rq->scx.ddsp_deferred_locals); - schedule_deferred(rq); + schedule_deferred_locked(rq); return; } @@ -1196,6 +1323,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, { struct scx_sched *sch = scx_root; struct task_struct **ddsp_taskp; + struct scx_dispatch_q *dsq; unsigned long qseq; WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); @@ -1214,7 +1342,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (scx_rq_bypassing(rq)) { __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); - goto global; + goto bypass; } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -1263,8 +1391,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, direct: direct_dispatch(sch, p, enq_flags); return; - +local_norefill: + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); + return; local: + dsq = &rq->scx.local_dsq; + goto enqueue; +global: + dsq = find_global_dsq(sch, p); + goto enqueue; +bypass: + dsq = &task_rq(p)->scx.bypass_dsq; + goto enqueue; + +enqueue: /* * For task-ordering, slice refill must be treated as implying the end * of the current slice. Otherwise, the longer @p stays on the CPU, the @@ -1272,14 +1412,7 @@ local: */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); -local_norefill: - dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); - return; - -global: - touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(sch, p); - dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); + dispatch_enqueue(sch, dsq, p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -1453,7 +1586,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { struct scx_sched *sch = scx_root; - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; if (SCX_HAS_OP(sch, yield)) SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); @@ -1464,7 +1597,7 @@ static void yield_task_scx(struct rq *rq) static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { struct scx_sched *sch = scx_root; - struct task_struct *from = rq->curr; + struct task_struct *from = rq->donor; if (SCX_HAS_OP(sch, yield)) return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, @@ -1720,8 +1853,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, * @p is going from a non-local DSQ to a non-local DSQ. As * $src_dsq is already locked, do an abbreviated dequeue. */ - task_unlink_from_dsq(p, src_dsq); - p->scx.dsq = NULL; + dispatch_dequeue_locked(p, src_dsq); raw_spin_unlock(&src_dsq->lock); dispatch_enqueue(sch, dst_dsq, p, enq_flags); @@ -1730,49 +1862,12 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, return dst_rq; } -/* - * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly - * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artificial delays while the - * bypass mode is switching to guarantee timely completion. - */ -static void scx_breather(struct rq *rq) -{ - u64 until; - - lockdep_assert_rq_held(rq); - - if (likely(!atomic_read(&scx_breather_depth))) - return; - - raw_spin_rq_unlock(rq); - - until = ktime_get_ns() + NSEC_PER_MSEC; - - do { - int cnt = 1024; - while (atomic_read(&scx_breather_depth) && --cnt) - cpu_relax(); - } while (atomic_read(&scx_breather_depth) && - time_before64(ktime_get_ns(), until)); - - raw_spin_rq_lock(rq); -} - static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* - * This retry loop can repeatedly race against scx_bypass() dequeueing - * tasks from @dsq trying to put the system into the bypass mode. On - * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock - * the machine into soft lockups. Give a breather. - */ - scx_breather(rq); - - /* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test * @dsq->list without locking and skip if it seems empty. @@ -1785,6 +1880,17 @@ retry: nldsq_for_each_task(p, dsq) { struct rq *task_rq = task_rq(p); + /* + * This loop can lead to multiple lockup scenarios, e.g. the BPF + * scheduler can put an enormous number of affinitized tasks into + * a contended DSQ, or the outer retry loop can repeatedly race + * against scx_bypass() dequeueing tasks from @dsq trying to put + * the system into the bypass mode. This can easily live-lock the + * machine. If aborting, exit from all non-bypass DSQs. + */ + if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS) + break; + if (rq == task_rq) { task_unlink_from_dsq(p, dsq); move_local_task_to_local_dsq(p, 0, dsq, rq); @@ -2003,6 +2109,19 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) dspc->cursor = 0; } +static inline void maybe_queue_balance_callback(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING)) + return; + + queue_balance_callback(rq, &rq->scx.deferred_bal_cb, + deferred_bal_cb_workfn); + + rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING; +} + static int balance_one(struct rq *rq, struct task_struct *prev) { struct scx_sched *sch = scx_root; @@ -2013,7 +2132,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && unlikely(rq->scx.cpu_released)) { @@ -2055,8 +2174,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (consume_global_dsq(sch, rq)) goto has_tasks; - if (unlikely(!SCX_HAS_OP(sch, dispatch)) || - scx_rq_bypassing(rq) || !scx_rq_online(rq)) + if (scx_rq_bypassing(rq)) { + if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) + goto has_tasks; + else + goto no_tasks; + } + + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) goto no_tasks; dspc->rq = rq; @@ -2119,40 +2244,6 @@ has_tasks: return true; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) -{ - int ret; - - rq_unpin_lock(rq, rf); - - ret = balance_one(rq, prev); - -#ifdef CONFIG_SCHED_SMT - /* - * When core-sched is enabled, this ops.balance() call will be followed - * by pick_task_scx() on this CPU and the SMT siblings. Balance the - * siblings too. - */ - if (sched_core_enabled(rq)) { - const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); - int scpu; - - for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { - struct rq *srq = cpu_rq(scpu); - struct task_struct *sprev = srq->curr; - - WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); - update_rq_clock(srq); - balance_one(srq, sprev); - } - } -#endif - rq_repin_lock(rq, rf); - - return ret; -} - static void process_ddsp_deferred_locals(struct rq *rq) { struct task_struct *p; @@ -2241,12 +2332,6 @@ static void switch_class(struct rq *rq, struct task_struct *next) struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; - /* - * Pairs with the smp_load_acquire() issued by a CPU in - * kick_cpus_irq_workfn() who is waiting for this CPU to perform a - * resched. - */ - smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; @@ -2286,6 +2371,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct scx_sched *sch = scx_root; + + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ @@ -2332,41 +2421,37 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq) +static struct task_struct * +do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) { struct task_struct *prev = rq->curr; + bool keep_prev, kick_idle = false; struct task_struct *p; - bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; - bool kick_idle = false; + + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + + rq_modified_clear(rq); + + rq_unpin_lock(rq, rf); + balance_one(rq, prev); + rq_repin_lock(rq, rf); + maybe_queue_balance_callback(rq); /* - * WORKAROUND: - * - * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just - * have gone through balance_scx(). Unfortunately, there currently is a - * bug where fair could say yes on balance() but no on pick_task(), - * which then ends up calling pick_task_scx() without preceding - * balance_scx(). + * If any higher-priority sched class enqueued a runnable task on + * this rq during balance_one(), abort and return RETRY_TASK, so + * that the scheduler loop can restart. * - * Keep running @prev if possible and avoid stalling from entering idle - * without balancing. - * - * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() - * if pick_task_scx() is called without preceding balance_scx(). + * If @force_scx is true, always try to pick a SCHED_EXT task, + * regardless of any higher-priority sched classes activity. */ - if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev->scx.flags & SCX_TASK_QUEUED) { - keep_prev = true; - } else { - keep_prev = false; - kick_idle = true; - } - } else if (unlikely(keep_prev && - prev->sched_class != &ext_sched_class)) { - /* - * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is - * conditional on scx_enabled() and may have been skipped. - */ + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) + return RETRY_TASK; + + keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); keep_prev = false; } @@ -2404,6 +2489,11 @@ static struct task_struct *pick_task_scx(struct rq *rq) return p; } +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) +{ + return do_pick_task_scx(rq, rf, false); +} + #ifdef CONFIG_SCHED_CORE /** * scx_prio_less - Task ordering for core-sched @@ -2860,7 +2950,7 @@ void init_scx_entity(struct sched_ext_entity *scx) INIT_LIST_HEAD(&scx->runnable_node); scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; - scx->slice = SCX_SLICE_DFL; + scx->slice = READ_ONCE(scx_slice_dfl); } void scx_pre_fork(struct task_struct *p) @@ -2904,9 +2994,9 @@ void scx_post_fork(struct task_struct *p) } } - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); list_add_tail(&p->scx.tasks_node, &scx_tasks); - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); percpu_up_read(&scx_fork_rwsem); } @@ -2926,13 +3016,13 @@ void scx_cancel_fork(struct task_struct *p) percpu_up_read(&scx_fork_rwsem); } -void sched_ext_free(struct task_struct *p) +void sched_ext_dead(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&scx_tasks_lock, flags); + raw_spin_lock_irqsave(&scx_tasks_lock, flags); list_del_init(&p->scx.tasks_node); - spin_unlock_irqrestore(&scx_tasks_lock, flags); + raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED @@ -2961,7 +3051,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p, p->scx.weight); } -static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) { } @@ -3030,6 +3120,7 @@ void scx_tg_init(struct task_group *tg) tg->scx.weight = CGROUP_WEIGHT_DFL; tg->scx.bw_period_us = default_bw_period_us(); tg->scx.bw_quota_us = RUNTIME_INF; + tg->scx.idle = false; } int scx_tg_online(struct task_group *tg) @@ -3178,7 +3269,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - /* TODO: Implement ops->cgroup_set_idle() */ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_ops_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL, + tg_cgrp(tg), idle); + + /* Update the task group's idle state */ + tg->scx.idle = idle; + + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_bandwidth(struct task_group *tg, @@ -3234,6 +3336,8 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { + .queue_mask = 1, + .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, @@ -3241,7 +3345,6 @@ DEFINE_SCHED_CLASS(ext) = { .wakeup_preempt = wakeup_preempt_scx, - .balance = balance_scx, .pick_task = pick_task_scx, .put_prev_task = put_prev_task_scx, @@ -3471,7 +3574,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work) struct scx_dispatch_q *dsq; int node; + irq_work_sync(&sch->error_irq_work); kthread_stop(sch->helper->task); + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) @@ -3590,38 +3695,55 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) } /** - * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * handle_lockup - sched_ext common lockup handler + * @fmt: format string * - * While there are various reasons why RCU CPU stalls can occur on a system - * that may not be caused by the current BPF scheduler, try kicking out the - * current scheduler in an attempt to recover the system to a good state before - * issuing panics. + * Called on system stall or lockup condition and initiates abort of sched_ext + * if enabled, which may resolve the reported lockup. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the lockup. %false if sched_ext is not enabled or abort was already + * initiated by someone else. */ -bool scx_rcu_cpu_stall(void) +static __printf(1, 2) bool handle_lockup(const char *fmt, ...) { struct scx_sched *sch; + va_list args; + bool ret; - rcu_read_lock(); + guard(rcu)(); sch = rcu_dereference(scx_root); - if (unlikely(!sch)) { - rcu_read_unlock(); + if (unlikely(!sch)) return false; - } switch (scx_enable_state()) { case SCX_ENABLING: case SCX_ENABLED: - break; + va_start(args, fmt); + ret = scx_verror(sch, fmt, args); + va_end(args); + return ret; default: - rcu_read_unlock(); return false; } +} - scx_error(sch, "RCU CPU stall detected!"); - rcu_read_unlock(); - - return true; +/** + * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * + * While there are various reasons why RCU CPU stalls can occur on a system + * that may not be caused by the current BPF scheduler, try kicking out the + * current scheduler in an attempt to recover the system to a good state before + * issuing panics. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported RCU stall. %false if sched_ext is not enabled or someone + * else already initiated abort. + */ +bool scx_rcu_cpu_stall(void) +{ + return handle_lockup("RCU CPU stall detected!"); } /** @@ -3632,50 +3754,240 @@ bool scx_rcu_cpu_stall(void) * live-lock the system by making many CPUs target the same DSQ to the point * where soft-lockup detection triggers. This function is called from * soft-lockup watchdog when the triggering point is close and tries to unjam - * the system by enabling the breather and aborting the BPF scheduler. + * the system and aborting the BPF scheduler. */ void scx_softlockup(u32 dur_s) { - struct scx_sched *sch; + if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) + return; - rcu_read_lock(); + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", + smp_processor_id(), dur_s); +} - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - goto out_unlock; +/** + * scx_hardlockup - sched_ext hardlockup handler + * + * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting + * numerous affinitized tasks in a single queue and directing all CPUs at it. + * Try kicking out the current scheduler in an attempt to recover the system to + * a good state before taking more drastic actions. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported hardlockdup. %false if sched_ext is not enabled or + * someone else already initiated abort. + */ +bool scx_hardlockup(int cpu) +{ + if (!handle_lockup("hard lockup - CPU %d", cpu)) + return false; - switch (scx_enable_state()) { - case SCX_ENABLING: - case SCX_ENABLED: - break; - default: - goto out_unlock; + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); + return true; +} + +static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, + struct cpumask *donee_mask, struct cpumask *resched_mask, + u32 nr_donor_target, u32 nr_donee_target) +{ + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + struct task_struct *p, *n; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); + s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; + u32 nr_balanced = 0, min_delta_us; + + /* + * All we want to guarantee is reasonable forward progress. No reason to + * fine tune. Assuming every task on @donor_dsq runs their full slice, + * consider offloading iff the total queued duration is over the + * threshold. + */ + min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; + if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) + return 0; + + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + list_add(&cursor.node, &donor_dsq->list); +resume: + n = container_of(&cursor, struct task_struct, scx.dsq_list); + n = nldsq_next_task(donor_dsq, n, false); + + while ((p = n)) { + struct rq *donee_rq; + struct scx_dispatch_q *donee_dsq; + int donee; + + n = nldsq_next_task(donor_dsq, n, false); + + if (donor_dsq->nr <= nr_donor_target) + break; + + if (cpumask_empty(donee_mask)) + break; + + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); + if (donee >= nr_cpu_ids) + continue; + + donee_rq = cpu_rq(donee); + donee_dsq = &donee_rq->scx.bypass_dsq; + + /* + * $p's rq is not locked but $p's DSQ lock protects its + * scheduling properties making this test safe. + */ + if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) + continue; + + /* + * Moving $p from one non-local DSQ to another. The source rq + * and DSQ are already locked. Do an abbreviated dequeue and + * then perform enqueue without unlocking $donor_dsq. + * + * We don't want to drop and reacquire the lock on each + * iteration as @donor_dsq can be very long and potentially + * highly contended. Donee DSQs are less likely to be contended. + * The nested locking is safe as only this LB moves tasks + * between bypass DSQs. + */ + dispatch_dequeue_locked(p, donor_dsq); + dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); + + /* + * $donee might have been idle and need to be woken up. No need + * to be clever. Kick every CPU that receives tasks. + */ + cpumask_set_cpu(donee, resched_mask); + + if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) + cpumask_clear_cpu(donee, donee_mask); + + nr_balanced++; + if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { + list_move_tail(&cursor.node, &n->scx.dsq_list.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); + cpu_relax(); + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + goto resume; + } } - /* allow only one instance, cleared at the end of scx_bypass() */ - if (test_and_set_bit(0, &scx_in_softlockup)) - goto out_unlock; + list_del_init(&cursor.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); + + return nr_balanced; +} + +static void bypass_lb_node(struct scx_sched *sch, int node) +{ + const struct cpumask *node_mask = cpumask_of_node(node); + struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; + struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; + u32 nr_target, nr_donor_target; + u32 before_min = U32_MAX, before_max = 0; + u32 after_min = U32_MAX, after_max = 0; + int cpu; + + /* count the target tasks and CPUs */ + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); - printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", - smp_processor_id(), dur_s, scx_root->ops.name); + nr_tasks += nr; + nr_cpus++; + + before_min = min(nr, before_min); + before_max = max(nr, before_max); + } + + if (!nr_cpus) + return; /* - * Some CPUs may be trapped in the dispatch paths. Enable breather - * immediately; otherwise, we might even be able to get to scx_bypass(). + * We don't want CPUs to have more than $nr_donor_target tasks and + * balancing to fill donee CPUs upto $nr_target. Once targets are + * calculated, find the donee CPUs. */ - atomic_inc(&scx_breather_depth); + nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); + nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); - scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s); -out_unlock: - rcu_read_unlock(); + cpumask_clear(donee_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) + cpumask_set_cpu(cpu, donee_mask); + } + + /* iterate !donee CPUs and see if they should be offloaded */ + cpumask_clear(resched_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + struct rq *rq = cpu_rq(cpu); + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + + if (cpumask_empty(donee_mask)) + break; + if (cpumask_test_cpu(cpu, donee_mask)) + continue; + if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) + continue; + + nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, + nr_donor_target, nr_target); + } + + for_each_cpu(cpu, resched_mask) { + struct rq *rq = cpu_rq(cpu); + + raw_spin_rq_lock_irq(rq); + resched_curr(rq); + raw_spin_rq_unlock_irq(rq); + } + + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + after_min = min(nr, after_min); + after_max = max(nr, after_max); + + } + + trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, + before_min, before_max, after_min, after_max); } -static void scx_clear_softlockup(void) +/* + * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine + * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some + * bypass DSQs can be overloaded. If there are enough tasks to saturate other + * lightly loaded CPUs, such imbalance can lead to very high execution latency + * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such + * outcomes, a simple load balancing mechanism is implemented by the following + * timer which runs periodically while bypass mode is in effect. + */ +static void scx_bypass_lb_timerfn(struct timer_list *timer) { - if (test_and_clear_bit(0, &scx_in_softlockup)) - atomic_dec(&scx_breather_depth); + struct scx_sched *sch; + int node; + u32 intv_us; + + sch = rcu_dereference_all(scx_root); + if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) + return; + + for_each_node_with_cpus(node) + bypass_lb_node(sch, node); + + intv_us = READ_ONCE(scx_bypass_lb_intv_us); + if (intv_us) + mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); } +static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); + /** * scx_bypass - [Un]bypass scx_ops and guarantee forward progress * @bypass: true for bypass, false for unbypass @@ -3719,25 +4031,34 @@ static void scx_bypass(bool bypass) sch = rcu_dereference_bh(scx_root); if (bypass) { - scx_bypass_depth++; + u32 intv_us; + + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); WARN_ON_ONCE(scx_bypass_depth <= 0); if (scx_bypass_depth != 1) goto unlock; + WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); bypass_timestamp = ktime_get_ns(); if (sch) scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); + + intv_us = READ_ONCE(scx_bypass_lb_intv_us); + if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { + scx_bypass_lb_timer.expires = + jiffies + usecs_to_jiffies(intv_us); + add_timer_global(&scx_bypass_lb_timer); + } } else { - scx_bypass_depth--; + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1); WARN_ON_ONCE(scx_bypass_depth < 0); if (scx_bypass_depth != 0) goto unlock; + WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL); if (sch) scx_add_event(sch, SCX_EV_BYPASS_DURATION, ktime_get_ns() - bypass_timestamp); } - atomic_inc(&scx_breather_depth); - /* * No task property is changing. We just need to make sure all currently * queued tasks are re-queued according to the new scx_rq_bypassing() @@ -3780,11 +4101,10 @@ static void scx_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { - struct sched_enq_and_set_ctx ctx; - /* cycling deq/enq is enough, see the function comment */ - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* nothing */ ; + } } /* resched to restore ticks and idle state */ @@ -3794,10 +4114,8 @@ static void scx_bypass(bool bypass) raw_spin_rq_unlock(rq); } - atomic_dec(&scx_breather_depth); unlock: raw_spin_unlock_irqrestore(&bypass_lock, flags); - scx_clear_softlockup(); } static void free_exit_info(struct scx_exit_info *ei) @@ -3850,6 +4168,20 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } +static void free_kick_syncs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); + struct scx_kick_syncs *to_free; + + to_free = rcu_replace_pointer(*ksyncs, NULL, true); + if (to_free) + kvfree_rcu(to_free, rcu); + } +} + static void scx_disable_workfn(struct kthread_work *work) { struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); @@ -3871,6 +4203,7 @@ static void scx_disable_workfn(struct kthread_work *work) /* guarantee forward progress by bypassing scx_ops */ scx_bypass(true); + WRITE_ONCE(scx_aborting, false); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: @@ -3913,22 +4246,19 @@ static void scx_disable_workfn(struct kthread_work *work) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; - const struct sched_class *new_class = - __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; + const struct sched_class *new_class = scx_setscheduler_class(p); - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + update_rq_clock(task_rq(p)); - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); - - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); scx_exit_task(p); } scx_task_iter_stop(&sti); @@ -3986,6 +4316,7 @@ static void scx_disable_workfn(struct kthread_work *work) free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; + free_kick_syncs(); mutex_unlock(&scx_enable_mutex); @@ -3994,9 +4325,24 @@ done: scx_bypass(false); } -static void scx_disable(enum scx_exit_kind kind) +static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; + + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) + return false; + + /* + * Some CPUs may be trapped in the dispatch paths. Set the aborting + * flag to break potential live-lock scenarios, ensuring we can + * successfully reach scx_bypass(). + */ + WRITE_ONCE(scx_aborting, true); + return true; +} + +static void scx_disable(enum scx_exit_kind kind) +{ struct scx_sched *sch; if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) @@ -4005,7 +4351,7 @@ static void scx_disable(enum scx_exit_kind kind) rcu_read_lock(); sch = rcu_dereference(scx_root); if (sch) { - atomic_try_cmpxchg(&sch->exit_kind, &none, kind); + scx_claim_exit(sch, kind); kthread_queue_work(sch->helper, &sch->disable_work); } rcu_read_unlock(); @@ -4216,7 +4562,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) size_t avail, used; bool idle; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; @@ -4234,10 +4580,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_init(&ns, buf, avail); dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", cpu, rq->scx.nr_running, rq->scx.flags, rq->scx.cpu_released, rq->scx.ops_qseq, - rq->scx.pnt_seq); + rq->scx.kick_sync); dump_line(&ns, " curr=%s[%d] class=%ps", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); @@ -4285,7 +4631,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) scx_dump_task(&s, &dctx, p, ' '); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); } dump_newline(&s); @@ -4321,15 +4667,14 @@ static void scx_error_irq_workfn(struct irq_work *irq_work) kthread_queue_work(sch->helper, &sch->disable_work); } -static void scx_vexit(struct scx_sched *sch, +static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args) { struct scx_exit_info *ei = sch->exit_info; - int none = SCX_EXIT_NONE; - if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) - return; + if (!scx_claim_exit(sch, kind)) + return false; ei->exit_code = exit_code; #ifdef CONFIG_STACKTRACE @@ -4346,6 +4691,34 @@ static void scx_vexit(struct scx_sched *sch, ei->reason = scx_exit_reason(ei->kind); irq_work_queue(&sch->error_irq_work); + return true; +} + +static int alloc_kick_syncs(void) +{ + int cpu; + + /* + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size + * can exceed percpu allocator limits on large machines. + */ + for_each_possible_cpu(cpu) { + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); + struct scx_kick_syncs *new_ksyncs; + + WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); + + new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), + GFP_KERNEL, cpu_to_node(cpu)); + if (!new_ksyncs) { + free_kick_syncs(); + return -ENOMEM; + } + + rcu_assign_pointer(*ksyncs, new_ksyncs); + } + + return 0; } static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) @@ -4392,8 +4765,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); - if (!sch->helper) + if (IS_ERR(sch->helper)) { + ret = PTR_ERR(sch->helper); goto err_free_pcpu; + } + sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -4426,7 +4802,7 @@ err_free_sch: return ERR_PTR(ret); } -static void check_hotplug_seq(struct scx_sched *sch, +static int check_hotplug_seq(struct scx_sched *sch, const struct sched_ext_ops *ops) { unsigned long long global_hotplug_seq; @@ -4443,8 +4819,11 @@ static void check_hotplug_seq(struct scx_sched *sch, SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, "expected hotplug seq %llu did not match actual %llu", ops->hotplug_seq, global_hotplug_seq); + return -EBUSY; } } + + return 0; } static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) @@ -4471,6 +4850,9 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + if (ops->cpu_acquire || ops->cpu_release) + pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); + return 0; } @@ -4495,10 +4877,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) goto err_unlock; } + ret = alloc_kick_syncs(); + if (ret) + goto err_unlock; + sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_unlock; + goto err_free_ksyncs; } /* @@ -4507,6 +4893,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); WARN_ON_ONCE(scx_root); + if (WARN_ON_ONCE(READ_ONCE(scx_aborting))) + WRITE_ONCE(scx_aborting, false); atomic_long_set(&scx_nr_rejected, 0); @@ -4542,7 +4930,11 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) set_bit(i, sch->has_op); - check_hotplug_seq(sch, ops); + ret = check_hotplug_seq(sch, ops); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } scx_idle_update_selcpu_topology(ops); cpus_read_unlock(); @@ -4657,27 +5049,20 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; - const struct sched_class *new_class = - __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; + const struct sched_class *new_class = scx_setscheduler_class(p); - if (!tryget_task_struct(p)) + if (scx_get_task_state(p) != SCX_TASK_READY) continue; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - - p->scx.slice = SCX_SLICE_DFL; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); - - sched_enq_and_set_task(&ctx); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - check_class_changed(task_rq(p), p, old_class, p->prio); - put_task_struct(p); + scoped_guard (sched_change, p, queue_flags) { + p->scx.slice = READ_ONCE(scx_slice_dfl); + p->sched_class = new_class; + } } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); @@ -4701,6 +5086,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) return 0; +err_free_ksyncs: + free_kick_syncs(); err_unlock: mutex_unlock(&scx_enable_mutex); return ret; @@ -4917,6 +5304,7 @@ static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *fro static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} +static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} #endif static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} @@ -4955,6 +5343,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, + .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, @@ -5028,29 +5417,38 @@ static bool can_skip_idle_kick(struct rq *rq) return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); } -static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) { struct rq *rq = cpu_rq(cpu); struct scx_rq *this_scx = &this_rq->scx; + const struct sched_class *cur_class; bool should_wait = false; unsigned long flags; raw_spin_rq_lock_irqsave(rq, flags); + cur_class = rq->curr->sched_class; /* * During CPU hotplug, a CPU may depend on kicking itself to make - * forward progress. Allow kicking self regardless of online state. + * forward progress. Allow kicking self regardless of online state. If + * @cpu is running a higher class task, we have no control over @cpu. + * Skip kicking. */ - if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && + !sched_class_above(cur_class, &ext_sched_class)) { if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { - if (rq->curr->sched_class == &ext_sched_class) + if (cur_class == &ext_sched_class) rq->curr->scx.slice = 0; cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); } if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { - pseqs[cpu] = rq->scx.pnt_seq; - should_wait = true; + if (cur_class == &ext_sched_class) { + ksyncs[cpu] = rq->scx.kick_sync; + should_wait = true; + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } } resched_curr(rq); @@ -5082,12 +5480,20 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); bool should_wait = false; + unsigned long *ksyncs; s32 cpu; + if (unlikely(!ksyncs_pcpu)) { + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs"); + return; + } + + ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; + for_each_cpu(cpu, this_scx->cpus_to_kick) { - should_wait |= kick_one_cpu(cpu, this_rq, pseqs); + should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } @@ -5101,20 +5507,21 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) return; for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; + unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - if (cpu != cpu_of(this_rq)) { - /* - * Pairs with smp_store_release() issued by this CPU in - * switch_class() on the resched path. - * - * We busy-wait here to guarantee that no other task can - * be scheduled on our core before the target CPU has - * entered the resched path. - */ - while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) - cpu_relax(); - } + /* + * Busy-wait until the task running at the time of kicking is no + * longer running. This can be used to implement e.g. core + * scheduling. + * + * smp_cond_load_acquire() pairs with store_releases in + * pick_task_scx() and put_prev_task_scx(). The former breaks + * the wait if SCX's scheduling path is entered even if the same + * task is picked subsequently. The latter is necessary to break + * the wait when $cpu is taken by a higher sched class. + */ + if (cpu != cpu_of(this_rq)) + smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } @@ -5208,16 +5615,12 @@ void __init init_sched_ext_class(void) scx_idle_init_masks(); - scx_kick_cpus_pnt_seqs = - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, - __alignof__(scx_kick_cpus_pnt_seqs[0])); - BUG_ON(!scx_kick_cpus_pnt_seqs); - for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -5225,8 +5628,8 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); - init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); - init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); + rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); if (cpu_online(cpu)) cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; @@ -5323,19 +5726,23 @@ __bpf_kfunc_start_defs(); * exhaustion. If zero, the current residual slice is maintained. If * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with * scx_bpf_kick_cpu() to trigger scheduling. + * + * Returns %true on successful insertion, %false on failure. On the root + * scheduler, %false return triggers scheduler abort and the caller doesn't need + * to check the return value. */ -__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) +__bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, + u64 slice, u64 enq_flags) { struct scx_sched *sch; guard(rcu)(); sch = rcu_dereference(scx_root); if (unlikely(!sch)) - return; + return false; if (!scx_dsq_insert_preamble(sch, p, enq_flags)) - return; + return false; if (slice) p->scx.slice = slice; @@ -5343,56 +5750,114 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice p->scx.slice = p->scx.slice ?: 1; scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); + + return true; +} + +/* + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. + */ +__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, + u64 slice, u64 enq_flags) +{ + scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags); +} + +static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) +{ + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + return false; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + + return true; } +struct scx_bpf_dsq_insert_vtime_args { + /* @p can't be packed together as KF_RCU is not transitive */ + u64 dsq_id; + u64 slice; + u64 vtime; + u64 enq_flags; +}; + /** - * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion * @p: task_struct to insert - * @dsq_id: DSQ to insert into - * @slice: duration @p can run for in nsecs, 0 to keep the current value - * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ - * @enq_flags: SCX_ENQ_* + * @args: struct containing the rest of the arguments + * @args->dsq_id: DSQ to insert into + * @args->slice: duration @p can run for in nsecs, 0 to keep the current value + * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @args->enq_flags: SCX_ENQ_* + * + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument + * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided + * as an inline wrapper in common.bpf.h. * - * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. - * Tasks queued into the priority queue are ordered by @vtime. All other aspects - * are identical to scx_bpf_dsq_insert(). + * Insert @p into the vtime priority queue of the DSQ identified by + * @args->dsq_id. Tasks queued into the priority queue are ordered by + * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). * - * @vtime ordering is according to time_before64() which considers wrapping. A - * numerically larger vtime may indicate an earlier position in the ordering and - * vice-versa. + * @args->vtime ordering is according to time_before64() which considers + * wrapping. A numerically larger vtime may indicate an earlier position in the + * ordering and vice-versa. * * A DSQ can only be used as a FIFO or priority queue at any given time and this * function must not be called on a DSQ which already has one or more FIFO tasks * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and * SCX_DSQ_GLOBAL) cannot be used as priority queues. + * + * Returns %true on successful insertion, %false on failure. On the root + * scheduler, %false return triggers scheduler abort and the caller doesn't need + * to check the return value. */ -__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) +__bpf_kfunc bool +__scx_bpf_dsq_insert_vtime(struct task_struct *p, + struct scx_bpf_dsq_insert_vtime_args *args) { struct scx_sched *sch; guard(rcu)(); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) - return; + return false; - if (!scx_dsq_insert_preamble(sch, p, enq_flags)) - return; + return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, + args->vtime, args->enq_flags); +} - if (slice) - p->scx.slice = slice; - else - p->scx.slice = p->scx.slice ?: 1; +/* + * COMPAT: Will be removed in v6.23. + */ +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + struct scx_sched *sch; - p->scx.dsq_vtime = vtime; + guard(rcu)(); - scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) @@ -5416,6 +5881,13 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, return false; /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). + */ + if (unlikely(READ_ONCE(scx_aborting))) + return false; + + /* * Can be called from either ops.dispatch() locking this_rq() or any * context where no rq lock is held. If latter, lock @p's task_rq which * we'll likely need anyway. @@ -5435,13 +5907,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, raw_spin_rq_lock(src_rq); } - /* - * If the BPF scheduler keeps calling this function repeatedly, it can - * cause similar live-lock conditions as consume_dispatch_q(). Insert a - * breather if necessary. - */ - scx_breather(src_rq); - locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); @@ -5646,8 +6111,9 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq * lock (e.g. BPF timers or SYSCALL programs). * - * Returns %true if @p has been consumed, %false if @p had already been consumed - * or dequeued. + * Returns %true if @p has been consumed, %false if @p had already been + * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local + * DSQ. */ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, @@ -5688,8 +6154,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_dispatch) @@ -5699,32 +6165,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { .set = &scx_kfunc_ids_dispatch, }; -__bpf_kfunc_start_defs(); - -/** - * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ - * - * Iterate over all of the tasks currently enqueued on the local DSQ of the - * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of - * processed tasks. Can only be called from ops.cpu_release(). - */ -__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +static u32 reenq_local(struct rq *rq) { - struct scx_sched *sch; LIST_HEAD(tasks); u32 nr_enqueued = 0; - struct rq *rq; struct task_struct *p, *n; - guard(rcu)(); - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - return 0; - - if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) - return 0; - - rq = cpu_rq(smp_processor_id()); lockdep_assert_rq_held(rq); /* @@ -5761,6 +6207,37 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) return nr_enqueued; } +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + * + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void + * returning variant that can be called from anywhere. + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + + return reenq_local(rq); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) @@ -5820,8 +6297,8 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) @@ -5833,6 +6310,34 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); +/** + * scx_bpf_task_set_slice - Set task's time slice + * @p: task of interest + * @slice: time slice to set in nsecs + * + * Set @p's time slice to @slice. Returns %true on success, %false if the + * calling scheduler doesn't have authority over @p. + */ +__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +{ + p->scx.slice = slice; + return true; +} + +/** + * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering + * @p: task of interest + * @vtime: virtual time to set + * + * Set @p's virtual time to @vtime. Returns %true on success, %false if the + * calling scheduler doesn't have authority over @p. + */ +__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +{ + p->scx.dsq_vtime = vtime; + return true; +} + static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; @@ -5990,6 +6495,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, sizeof(struct bpf_iter_scx_dsq)); BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & + ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); /* * next() and destroy() will be called regardless of the return value. @@ -6008,9 +6515,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (!kit->dsq) return -ENOENT; - INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; - kit->cursor.priv = READ_ONCE(kit->dsq->seq); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, + READ_ONCE(kit->dsq->seq)); return 0; } @@ -6084,6 +6590,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) kit->dsq = NULL; } +/** + * scx_bpf_dsq_peek - Lockless peek at the first element. + * @dsq_id: DSQ to examine. + * + * Read the first element in the DSQ. This is semantically equivalent to using + * the DSQ iterator, but is lockfree. Of course, like any lockless operation, + * this provides only a point-in-time snapshot, and the contents may change + * by the time any subsequent locking operation reads the queue. + * + * Returns the pointer, or NULL indicates an empty queue OR internal error. + */ +__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) +{ + struct scx_sched *sch; + struct scx_dispatch_q *dsq; + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { + scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); + return NULL; + } + + dsq = find_user_dsq(sch, dsq_id); + if (unlikely(!dsq)) { + scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); + return NULL; + } + + return rcu_dereference(dsq->first_task); +} + __bpf_kfunc_end_defs(); static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, @@ -6238,6 +6778,24 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, } /** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from + * anywhere. + */ +__bpf_kfunc void scx_bpf_reenqueue_local___v2(void) +{ + struct rq *rq; + + guard(preempt)(); + + rq = this_rq(); + local_set(&rq->scx.reenq_local_deferred, 1); + schedule_deferred(rq); +} + +/** * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU * @cpu: CPU of interest * @@ -6305,7 +6863,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) guard(rcu)(); - sch = rcu_dereference(sch); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) return; @@ -6638,15 +7196,19 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU); +BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) @@ -6737,6 +7299,12 @@ static int __init scx_init(void) return ret; } + if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || + !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { + pr_err("sched_ext: Failed to allocate cpumasks\n"); + return -ENOMEM; + } + return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index d2434c954848..3d9d404d5cd2 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, return prev_cpu; } +struct scx_bpf_select_cpu_and_args { + /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */ + s32 prev_cpu; + u64 wake_flags; + u64 flags; +}; + /** - * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, - * prioritizing those in @cpus_allowed + * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags * @cpus_allowed: cpumask of allowed CPUs - * @flags: %SCX_PICK_IDLE* flags + * @args: struct containing the rest of the arguments + * @args->prev_cpu: CPU @p was on previously + * @args->wake_flags: %SCX_WAKE_* flags + * @args->flags: %SCX_PICK_IDLE* flags + * + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument + * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided + * as an inline wrapper in common.bpf.h. * * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked * context such as a BPF test_run() call, as long as built-in CPU selection * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE * is set. * - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu(). * * Returns the selected idle CPU, which will be automatically awakened upon * returning from ops.select_cpu() and can be used for direct dispatch, or * a negative value if no idle CPU is available. */ +__bpf_kfunc s32 +__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, + struct scx_bpf_select_cpu_and_args *args) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags, + cpus_allowed, args->flags); +} + +/* + * COMPAT: Will be removed in v6.22. + */ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags) { @@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_idle) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index b3617abed510..386c677e4c9a 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -23,6 +23,11 @@ enum scx_consts { * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. */ SCX_TASK_ITER_BATCH = 32, + + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, + SCX_BYPASS_LB_DONOR_PCT = 125, + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, + SCX_BYPASS_LB_BATCH = 256, }; enum scx_exit_kind { @@ -697,12 +702,23 @@ struct sched_ext_ops { * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be * interpreted in the same fashion and specifies how much @cgrp can * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is upto to the BPF + * interpretation of @period_us and burstiness is up to the BPF * scheduler. */ void (*cgroup_set_bandwidth)(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us); + /** + * @cgroup_set_idle: A cgroup's idle state is being changed + * @cgrp: cgroup whose idle state is being updated + * @idle: whether the cgroup is entering or exiting idle state + * + * Update @cgrp's idle state to @idle. This callback is invoked when + * a cgroup transitions between idle and non-idle states, allowing the + * BPF scheduler to adjust its behavior accordingly. + */ + void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle); + #endif /* CONFIG_EXT_GROUP_SCHED */ /* @@ -884,6 +900,10 @@ struct scx_sched { struct scx_dispatch_q **global_dsqs; struct scx_sched_pcpu __percpu *pcpu; + /* + * Updates to the following warned bitfields can race causing RMW issues + * but it doesn't really matter. + */ bool warned_zero_slice:1; bool warned_deprecated_rq:1; @@ -948,6 +968,7 @@ enum scx_enq_flags { SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_DSQ_PRIQ = 1LLU << 57, + SCX_ENQ_NESTED = 1LLU << 58, }; enum scx_deq_flags { @@ -986,8 +1007,10 @@ enum scx_kick_flags { SCX_KICK_PREEMPT = 1LLU << 1, /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. + * The scx_bpf_kick_cpu() call will return after the current SCX task of + * the target CPU switches out. This can be used to implement e.g. core + * scheduling. This has no effect if the current task on the target CPU + * is not on SCX. */ SCX_KICK_WAIT = 1LLU << 2, }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc0b7ce8a65d..769d7b7990df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a, static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->min_vruntime); + return (s64)(se->vruntime - cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * Which we track using: * - * v0 := cfs_rq->min_vruntime + * v0 := cfs_rq->zero_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * - * Since min_vruntime is a monotonic increasing variable that closely tracks - * the per-task service, these deltas: (v_i - v), will be in the order of the - * maximal (virtual) lag induced in the system due to quantisation. + * Since zero_vruntime closely tracks the per-task service, these + * deltas: (v_i - v), will be in the order of the maximal (virtual) lag + * induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * @@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) avg = div_s64(avg, load); } - return cfs_rq->min_vruntime + avg; + return cfs_rq->zero_vruntime + avg; } /* @@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; + return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +static void update_zero_vruntime(struct cfs_rq *cfs_rq) { - u64 min_vruntime = cfs_rq->min_vruntime; - /* - * open coded max_vruntime() to allow updating avg_vruntime - */ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta > 0) { - avg_vruntime_update(cfs_rq, delta); - min_vruntime = vruntime; - } - return min_vruntime; -} - -static void update_min_vruntime(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = __pick_root_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; - - if (curr) { - if (curr->on_rq) - vruntime = curr->vruntime; - else - curr = NULL; - } + u64 vruntime = avg_vruntime(cfs_rq); + s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); - if (se) { - if (!curr) - vruntime = se->min_vruntime; - else - vruntime = min_vruntime(vruntime, se->min_vruntime); - } + avg_vruntime_update(cfs_rq, delta); - /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); + cfs_rq->zero_vruntime = vruntime; } static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) @@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); + update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); + update_zero_vruntime(cfs_rq); } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) @@ -955,6 +929,16 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; + /* + * Picking the ->next buddy will affect latency but not fairness. + */ + if (sched_feat(PICK_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + WARN_ON_ONCE(cfs_rq->next->sched_delayed); + return cfs_rq->next; + } + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; @@ -1193,6 +1177,8 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) return delta_exec; } +static void set_next_buddy(struct sched_entity *se); + /* * Used by other classes to account runtime. */ @@ -1226,7 +1212,6 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->vruntime += calc_delta_fair(delta_exec, curr); resched = update_deadline(cfs_rq, curr); - update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { /* @@ -1239,8 +1224,7 @@ static void update_curr(struct cfs_rq *cfs_rq) * against fair_server such that it can account for this time * and possibly avoid running this period. */ - if (dl_server_active(&rq->fair_server)) - dl_server_update(&rq->fair_server, delta_exec); + dl_server_update(&rq->fair_server, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -3808,15 +3792,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (!curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; - - /* - * The entity's vruntime has been adjusted, so let's check - * whether the rq-wide min_vruntime needs updated too. Since - * the calculations above require stable min_vruntime rather - * than up-to-date one, we do the update at the end of the - * reweight process. - */ - update_min_vruntime(cfs_rq); } } @@ -5429,15 +5404,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); - /* - * Now advance min_vruntime if @se was the entity holding it back, - * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be - * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- i.e. we'll be penalized. - */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) - update_min_vruntime(cfs_rq); - if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); @@ -5512,16 +5478,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { struct sched_entity *se; - /* - * Picking the ->next buddy will affect latency but not fairness. - */ - if (sched_feat(PICK_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { - /* ->next will never be delayed */ - WARN_ON_ONCE(cfs_rq->next->sched_delayed); - return cfs_rq->next; - } - se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); @@ -6024,20 +5980,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; /* - * It's possible we are called with !runtime_remaining due to things - * like user changed quota setting(see tg_set_cfs_bandwidth()) or async - * unthrottled us with a positive runtime_remaining but other still - * running entities consumed those runtime before we reached here. + * It's possible we are called with runtime_remaining < 0 due to things + * like async unthrottled us with a positive runtime_remaining but other + * still running entities consumed those runtime before we reached here. * - * Anyway, we can't unthrottle this cfs_rq without any runtime remaining - * because any enqueue in tg_unthrottle_up() will immediately trigger a - * throttle, which is not supposed to happen on unthrottle path. + * We can't unthrottle this cfs_rq without any runtime remaining because + * any enqueue in tg_unthrottle_up() will immediately trigger a throttle, + * which is not supposed to happen on unthrottle path. */ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) return; - se = cfs_rq->tg->se[cpu_of(rq)]; - cfs_rq->throttled = 0; update_rq_clock(rq); @@ -6437,6 +6390,16 @@ static void sync_throttle(struct task_group *tg, int cpu) cfs_rq->throttle_count = pcfs_rq->throttle_count; cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); + + /* + * It is not enough to sync the "pelt_clock_throttled" indicator + * with the parent cfs_rq when the hierarchy is not queued. + * Always join a throttled hierarchy with PELT clock throttled + * and leaf it to the first enqueue, or distribution to + * unthrottle the PELT clock. + */ + if (cfs_rq->throttle_count) + cfs_rq->pelt_clock_throttled = 1; } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -6996,12 +6959,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) h_nr_idle = 1; } - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { - /* Account for idle runtime */ - if (!rq->nr_running) - dl_server_update_idle_time(rq, rq->curr); + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); - } /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -7028,8 +6987,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -static void set_next_buddy(struct sched_entity *se); - /* * Basically dequeue_task_fair(), except it can deal with dequeue_entity() * failing half-way through and resume the dequeue later. @@ -8705,15 +8662,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context set_task_max_allowed_capacity(p); } -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (sched_fair_runnable(rq)) - return 1; - - return sched_balance_newidle(rq, rf) != 0; -} - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -8725,16 +8673,81 @@ static void set_next_buddy(struct sched_entity *se) } } +enum preempt_wakeup_action { + PREEMPT_WAKEUP_NONE, /* No preemption. */ + PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */ + PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */ + PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */ +}; + +static inline bool +set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + /* + * Keep existing buddy if the deadline is sooner than pse. + * The older buddy may be cache cold and completely unrelated + * to the current wakeup but that is unpredictable where as + * obeying the deadline is more in line with EEVDF objectives. + */ + if (cfs_rq->next && entity_before(cfs_rq->next, pse)) + return false; + + set_next_buddy(pse); + return true; +} + +/* + * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not + * strictly enforced because the hint is either misunderstood or + * multiple tasks must be woken up. + */ +static inline enum preempt_wakeup_action +preempt_sync(struct rq *rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + u64 threshold, delta; + + /* + * WF_SYNC without WF_TTWU is not expected so warn if it happens even + * though it is likely harmless. + */ + WARN_ON_ONCE(!(wake_flags & WF_TTWU)); + + threshold = sysctl_sched_migration_cost; + delta = rq_clock_task(rq) - se->exec_start; + if ((s64)delta < 0) + delta = 0; + + /* + * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they + * could run on other CPUs. Reduce the threshold before preemption is + * allowed to an arbitrary lower value as it is more likely (but not + * guaranteed) the waker requires the wakee to finish. + */ + if (wake_flags & WF_RQ_SELECTED) + threshold >>= 2; + + /* + * As WF_SYNC is not strictly obeyed, allow some runtime for batch + * wakeups to be issued. + */ + if (entity_before(pse, se) && delta >= threshold) + return PREEMPT_WAKEUP_RESCHED; + + return PREEMPT_WAKEUP_NONE; +} + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { + enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; - bool do_preempt_short = false; if (unlikely(se == pse)) return; @@ -8748,10 +8761,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (task_is_throttled(p)) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { - set_next_buddy(pse); - } - /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. @@ -8783,7 +8792,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - do_preempt_short = true; + preempt_action = PREEMPT_WAKEUP_SHORT; goto preempt; } @@ -8802,27 +8811,74 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. */ - do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); + if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) { + preempt_action = PREEMPT_WAKEUP_SHORT; + goto pick; + } + + /* + * Ignore wakee preemption on WF_FORK as it is less likely that + * there is shared data as exec often follow fork. Do not + * preempt for tasks that are sched_delayed as it would violate + * EEVDF to forcibly queue an ineligible task. + */ + if ((wake_flags & WF_FORK) || pse->sched_delayed) + return; /* + * If @p potentially is completing work required by current then + * consider preemption. + * + * Reschedule if waker is no longer eligible. */ + if (in_task() && !entity_eligible(cfs_rq, se)) { + preempt_action = PREEMPT_WAKEUP_RESCHED; + goto preempt; + } + + /* Prefer picking wakee soon if appropriate. */ + if (sched_feat(NEXT_BUDDY) && + set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { + + /* + * Decide whether to obey WF_SYNC hint for a new buddy. Old + * buddies are ignored as they may not be relevant to the + * waker and less likely to be cache hot. + */ + if (wake_flags & WF_SYNC) + preempt_action = preempt_sync(rq, wake_flags, pse, se); + } + + switch (preempt_action) { + case PREEMPT_WAKEUP_NONE: + return; + case PREEMPT_WAKEUP_RESCHED: + goto preempt; + case PREEMPT_WAKEUP_SHORT: + fallthrough; + case PREEMPT_WAKEUP_PICK: + break; + } + +pick: + /* * If @p has become the most eligible task, force preemption. */ - if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) + if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) goto preempt; - if (sched_feat(RUN_TO_PARITY) && do_preempt_short) + if (sched_feat(RUN_TO_PARITY)) update_protect_slice(cfs_rq, se); return; preempt: - if (do_preempt_short) + if (preempt_action == PREEMPT_WAKEUP_SHORT) cancel_protect_slice(se); resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq) +static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) { struct sched_entity *se; struct cfs_rq *cfs_rq; @@ -8866,7 +8922,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - p = pick_task_fair(rq); + p = pick_task_fair(rq, rf); if (!p) goto idle; se = &p->se; @@ -8920,21 +8976,21 @@ simple: return p; idle: - if (!rf) - return NULL; - - new_tasks = sched_balance_newidle(rq, rf); + if (rf) { + new_tasks = sched_balance_newidle(rq, rf); - /* - * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is - * possible for any higher priority task to appear. In that case we - * must re-start the pick_next_entity() loop. - */ - if (new_tasks < 0) - return RETRY_TASK; + /* + * Because sched_balance_newidle() releases (and re-acquires) + * rq->lock, it is possible for any higher priority task to + * appear. In that case we must re-start the pick_next_entity() + * loop. + */ + if (new_tasks < 0) + return RETRY_TASK; - if (new_tasks > 0) - goto again; + if (new_tasks > 0) + goto again; + } /* * rq is about to be idle, check if we need to update the @@ -8945,14 +9001,10 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) +static struct task_struct * +fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) { - return pick_next_task_fair(rq, prev, NULL); -} - -static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) -{ - return pick_task_fair(dl_se->rq); + return pick_task_fair(dl_se->rq, rf); } void fair_server_init(struct rq *rq) @@ -8983,7 +9035,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t */ static void yield_task_fair(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *curr = rq->donor; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -9007,7 +9059,18 @@ static void yield_task_fair(struct rq *rq) */ rq_clock_skip_update(rq); - se->deadline += calc_delta_fair(se->slice, se); + /* + * Forfeit the remaining vruntime, only if the entity is eligible. This + * condition is necessary because in core scheduling we prefer to run + * ineligible tasks rather than force idling. If this happens we may + * end up in a loop where the core scheduler picks the yielding task, + * which yields immediately again; without the condition the vruntime + * ends up quickly running away. + */ + if (entity_eligible(cfs_rq, se)) { + se->vruntime = se->deadline; + se->deadline += calc_delta_fair(se->slice, se); + } } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -10671,7 +10734,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (sd->flags & SD_ASYM_CPUCAPACITY) sgs->group_misfit_task_load = 1; - for_each_cpu(i, sched_group_span(group)) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -11723,6 +11786,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd } /* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); + +/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ @@ -11747,6 +11825,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), }; + bool need_unlock = false; cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); @@ -11758,6 +11837,14 @@ redo: goto out_balanced; } + if (!need_unlock && (sd->flags & SD_SERIALIZE)) { + int zero = 0; + if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1)) + goto out_balanced; + + need_unlock = true; + } + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); @@ -11998,6 +12085,9 @@ out_one_pinned: sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; out: + if (need_unlock) + atomic_set_release(&sched_balance_running, 0); + return ld_moved; } @@ -12123,21 +12213,6 @@ out_unlock: } /* - * This flag serializes load-balancing passes over large domains - * (above the NODE topology level) - only one load-balancing instance - * may run at a time, to reduce overhead on very large systems with - * lots of CPUs and large NUMA distances. - * - * - Note that load-balancing passes triggered while another one - * is executing are skipped and not re-tried. - * - * - Also note that this does not serialize rebalance_domains() - * execution, as non-SD_SERIALIZE domains will still be - * load-balanced in parallel. - */ -static atomic_t sched_balance_running = ATOMIC_INIT(0); - -/* * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ @@ -12146,30 +12221,43 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) +{ + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) { + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; + unsigned long now = jiffies; + + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. - * - * sched_balance_newidle() bumps the cost whenever newidle - * balance fails, and we don't want things to grow out of - * control. Use the sysctl_sched_migration_cost as the upper - * limit, plus a litle extra to avoid off by ones. */ - sd->max_newidle_lb_cost = - min(cost, sysctl_sched_migration_cost + 200); - sd->last_decay_max_lb_cost = jiffies; - } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = now; + + } else if (time_after(now, next_decay)) { /* * Decay the newidle max times by ~1% per second to ensure that * it is not outdated and the current max cost is actually * shorter. */ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; - sd->last_decay_max_lb_cost = jiffies; - + sd->last_decay_max_lb_cost = now; return true; } @@ -12192,7 +12280,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize, need_decay = 0; + int need_decay = 0; u64 max_cost = 0; rcu_read_lock(); @@ -12201,7 +12289,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. */ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -12216,13 +12304,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) } interval = get_sd_balance_interval(sd, busy); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) - goto out; - } - if (time_after_eq(jiffies, sd->last_balance + interval)) { if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* @@ -12236,9 +12317,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } - if (need_serialize) - atomic_set_release(&sched_balance_running, 0); -out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; @@ -12817,18 +12895,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (!sd) { + rcu_read_unlock(); + goto out; + } if (!get_rd_overloaded(this_rq->rd) || - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { + this_rq->avg_idle < sd->max_newidle_lb_cost) { - if (sd) - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); - goto out; } rcu_read_unlock(); + rq_modified_clear(this_rq); raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); @@ -12844,6 +12925,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -12855,13 +12952,10 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = t1; /* - * Failing newidle means it is not effective; - * bump the cost so we end up doing less of it. + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. */ - if (!pulled_task) - domain_cost = (3 * sd->max_newidle_lb_cost) / 2; - - update_newidle_cost(sd, domain_cost); + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* @@ -12886,8 +12980,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_queued) + /* If a higher prio class was modified, restart the pick */ + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -13005,7 +13099,170 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) } /* - * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + * Consider any infeasible weight scenario. Take for instance two tasks, + * each bound to their respective sibling, one with weight 1 and one with + * weight 2. Then the lower weight task will run ahead of the higher weight + * task without bound. + * + * This utterly destroys the concept of a shared time base. + * + * Remember; all this is about a proportionally fair scheduling, where each + * tasks receives: + * + * w_i + * dt_i = ---------- dt (1) + * \Sum_j w_j + * + * which we do by tracking a virtual time, s_i: + * + * 1 + * s_i = --- d[t]_i (2) + * w_i + * + * Where d[t] is a delta of discrete time, while dt is an infinitesimal. + * The immediate corollary is that the ideal schedule S, where (2) to use + * an infinitesimal delta, is: + * + * 1 + * S = ---------- dt (3) + * \Sum_i w_i + * + * From which we can define the lag, or deviation from the ideal, as: + * + * lag(i) = S - s_i (4) + * + * And since the one and only purpose is to approximate S, we get that: + * + * \Sum_i w_i lag(i) := 0 (5) + * + * If this were not so, we no longer converge to S, and we can no longer + * claim our scheduler has any of the properties we derive from S. This is + * exactly what you did above, you broke it! + * + * + * Let's continue for a while though; to see if there is anything useful to + * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i: + * + * \Sum_i w_i s_i + * S = -------------- (6) + * \Sum_i w_i + * + * Which gives us a way to compute S, given our s_i. Now, if you've read + * our code, you know that we do not in fact do this, the reason for this + * is two-fold. Firstly, computing S in that way requires a 64bit division + * for every time we'd use it (see 12), and secondly, this only describes + * the steady-state, it doesn't handle dynamics. + * + * Anyway, in (6): s_i -> x + (s_i - x), to get: + * + * \Sum_i w_i (s_i - x) + * S - x = -------------------- (7) + * \Sum_i w_i + * + * Which shows that S and s_i transform alike (which makes perfect sense + * given that S is basically the (weighted) average of s_i). + * + * So the thing to remember is that the above is strictly UP. It is + * possible to generalize to multiple runqueues -- however it gets really + * yuck when you have to add affinity support, as illustrated by our very + * first counter-example. + * + * Luckily I think we can avoid needing a full multi-queue variant for + * core-scheduling (or load-balancing). The crucial observation is that we + * only actually need this comparison in the presence of forced-idle; only + * then do we need to tell if the stalled rq has higher priority over the + * other. + * + * [XXX assumes SMT2; better consider the more general case, I suspect + * it'll work out because our comparison is always between 2 rqs and the + * answer is only interesting if one of them is forced-idle] + * + * And (under assumption of SMT2) when there is forced-idle, there is only + * a single queue, so everything works like normal. + * + * Let, for our runqueue 'k': + * + * T_k = \Sum_i w_i s_i + * W_k = \Sum_i w_i ; for all i of k (8) + * + * Then we can write (6) like: + * + * T_k + * S_k = --- (9) + * W_k + * + * From which immediately follows that: + * + * T_k + T_l + * S_k+l = --------- (10) + * W_k + W_l + * + * On which we can define a combined lag: + * + * lag_k+l(i) := S_k+l - s_i (11) + * + * And that gives us the tools to compare tasks across a combined runqueue. + * + * + * Combined this gives the following: + * + * a) when a runqueue enters force-idle, sync it against it's sibling rq(s) + * using (7); this only requires storing single 'time'-stamps. + * + * b) when comparing tasks between 2 runqueues of which one is forced-idle, + * compare the combined lag, per (11). + * + * Now, of course cgroups (I so hate them) make this more interesting in + * that a) seems to suggest we need to iterate all cgroup on a CPU at such + * boundaries, but I think we can avoid that. The force-idle is for the + * whole CPU, all it's rqs. So we can mark it in the root and lazily + * propagate downward on demand. + */ + +/* + * So this sync is basically a relative reset of S to 0. + * + * So with 2 queues, when one goes idle, we drop them both to 0 and one + * then increases due to not being idle, and the idle one builds up lag to + * get re-elected. So far so simple, right? + * + * When there's 3, we can have the situation where 2 run and one is idle, + * we sync to 0 and let the idle one build up lag to get re-election. Now + * suppose another one also drops idle. At this point dropping all to 0 + * again would destroy the built-up lag from the queue that was already + * idle, not good. + * + * So instead of syncing everything, we can: + * + * less := !((s64)(s_a - s_b) <= 0) + * + * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b + * == v_a - (v_b - S_a + S_b) + * + * IOW, we can recast the (lag) comparison to a one-sided difference. + * So if then, instead of syncing the whole queue, sync the idle queue + * against the active queue with S_a + S_b at the point where we sync. + * + * (XXX consider the implication of living in a cyclic group: N / 2^n N) + * + * This gives us means of syncing single queues against the active queue, + * and for already idle queues to preserve their build-up lag. + * + * Of course, then we get the situation where there's 2 active and one + * going idle, who do we pick to sync against? Theory would have us sync + * against the combined S, but as we've already demonstrated, there is no + * such thing in infeasible weight scenarios. + * + * One thing I've considered; and this is where that core_active rudiment + * came from, is having active queues sync up between themselves after + * every tick. This limits the observed divergence due to the work + * conservancy. + * + * On top of that, we can improve upon things by employing (10) here. + */ + +/* + * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed. */ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, bool forceidle) @@ -13019,7 +13276,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, cfs_rq->forceidle_seq = fi_seq; } - cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime; } } @@ -13072,11 +13329,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, /* * Find delta after normalizing se's vruntime with its cfs_rq's - * min_vruntime_fi, which would have been updated in prior calls + * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). */ delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13138,11 +13395,14 @@ static void task_fork_fair(struct task_struct *p) * the current task. */ static void -prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (rq->cfs.nr_queued == 1) return; @@ -13154,8 +13414,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); - } else + } else { wakeup_preempt(rq, p, 0); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13187,6 +13448,8 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } + + assert_list_leaf_cfs_rq(rq_of(cfs_rq)); } #else /* !CONFIG_FAIR_GROUP_SCHED: */ static void propagate_entity_cfs_rq(struct sched_entity *se) { } @@ -13237,6 +13500,12 @@ static void attach_task_cfs_rq(struct task_struct *p) attach_entity_cfs_rq(se); } +static void switching_from_fair(struct rq *rq, struct task_struct *p) +{ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); +} + static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); @@ -13310,7 +13579,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + cfs_rq->zero_vruntime = (u64)(-(1LL << 20)); raw_spin_lock_init(&cfs_rq->removed.lock); } @@ -13611,6 +13880,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task */ DEFINE_SCHED_CLASS(fair) = { + .queue_mask = 2, + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -13619,11 +13890,10 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, .pick_task = pick_task_fair, - .pick_next_task = __pick_next_task_fair, + .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, - .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13638,6 +13908,7 @@ DEFINE_SCHED_CLASS(fair) = { .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, + .switching_from = switching_from_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331..980d92bab8ab 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, false) +SCHED_FEAT(NEXT_BUDDY, true) /* * Allow completely ignoring cfs_rq->next; which can be set from various @@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) + +/* + * Do newidle balancing proportional to its success rate using randomization. + */ +SCHED_FEAT(NI_RANDOM, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c39b089d4f09..c174afe1dd17 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -131,12 +131,13 @@ void __cpuidle default_idle_call(void) } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 max_latency_ns) { if (current_clr_polling_and_test()) return -EBUSY; - return cpuidle_enter_s2idle(drv, dev); + return cpuidle_enter_s2idle(drv, dev, max_latency_ns); } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -205,12 +206,13 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { + max_latency_ns = cpu_wakeup_latency_qos_limit() * + NSEC_PER_USEC; - entered_state = call_cpuidle_s2idle(drv, dev); + entered_state = call_cpuidle_s2idle(drv, dev, + max_latency_ns); if (entered_state > 0) goto exit_idle; - - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } @@ -452,9 +454,11 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } +static void update_curr_idle(struct rq *rq); + static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - dl_server_update_idle_time(rq, prev); + update_curr_idle(rq); scx_update_idle(rq, false, true); } @@ -466,7 +470,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir next->se.exec_start = rq_clock_task(rq); } -struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) { scx_update_idle(rq, true, false); return rq->idle; @@ -496,21 +500,36 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) */ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { + update_curr_idle(rq); } -static void switched_to_idle(struct rq *rq, struct task_struct *p) +static void switching_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); } static void update_curr_idle(struct rq *rq) { + struct sched_entity *se = &rq->idle->se; + u64 now = rq_clock_task(rq); + s64 delta_exec; + + delta_exec = now - se->exec_start; + if (unlikely(delta_exec <= 0)) + return; + + se->exec_start = now; + + dl_server_update_idle(&rq->fair_server, delta_exec); } /* @@ -518,6 +537,8 @@ static void update_curr_idle(struct rq *rq) */ DEFINE_SCHED_CLASS(idle) = { + .queue_mask = 0, + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -536,6 +557,6 @@ DEFINE_SCHED_CLASS(idle) = { .task_tick = task_tick_idle, .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, + .switching_to = switching_to_idle, .update_curr = update_curr_idle, }; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index a4cf17b1fab0..3ad0d6df6a0a 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -167,6 +167,29 @@ static int __init housekeeping_setup(char *str, unsigned long flags) } } + /* + * Check the combination of nohz_full and isolcpus=domain, + * necessary to avoid problems with the timer migration + * hierarchy. managed_irq is ignored by this check since it + * isn't considered in the timer migration logic. + */ + iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); + type = find_first_bit(&iter_flags, HK_TYPE_MAX); + /* + * Pass the check if none of these flags were previously set or + * are not in the current selection. + */ + iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); + first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 : + cpumask_first_and_and(cpu_present_mask, + housekeeping_staging, housekeeping.cpumasks[type]); + if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) { + pr_warn("Housekeeping: must include one present CPU " + "neither in nohz_full= nor in isolcpus=domain, " + "ignoring setting %s\n", str); + goto free_housekeeping_staging; + } + iter_flags = flags & ~housekeeping.flags; for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 62fba83b7bb1..623445603725 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,7 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_preempt(current); + rseq_sched_switch_event(current); } static void ipi_sync_rq_state(void *info) @@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id) * membarrier, we will end up with some thread in the mm * running without a core sync. * - * For RSEQ, don't rseq_preempt() the caller. User code - * is not supposed to issue syscalls at all from inside an - * rseq critical section. + * For RSEQ, don't invoke rseq_sched_switch_event() on the + * caller. User code is not supposed to issue syscalls at + * all from inside an rseq critical section. */ if (flags != MEMBARRIER_FLAG_SYNC_CORE) { preempt_disable(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7936d4333731..f1867fe8e5c5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1490,7 +1490,7 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) static void yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, rq->curr, 0); + requeue_task_rt(rq, rq->donor, 0); } static int find_lowest_rq(struct task_struct *task); @@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) { struct task_struct *p; @@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * us to initiate a push or pull. */ static void -prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (task_current_donor(rq, p)) { /* * If our priority decreases while running, we @@ -2566,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(rt) = { + .queue_mask = 4, + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -2589,8 +2594,8 @@ DEFINE_SCHED_CLASS(rt) = { .get_rr_interval = get_rr_interval_rt, - .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + .prio_changed = prio_changed_rt, .update_curr = update_curr_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1f5d07067f60..bbf513b3e76c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include <linux/prandom.h> #include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> #include <linux/sched/cpufreq.h> @@ -20,7 +21,6 @@ #include <linux/sched/task_flags.h> #include <linux/sched/task.h> #include <linux/sched/topology.h> - #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bug.h> @@ -405,6 +405,7 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * naturally thottled to once per period, avoiding high context switch * workloads from spamming the hrtimer program/cancel paths. */ +extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); @@ -412,8 +413,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_pick_f pick_task); extern void sched_init_dl_servers(void); -extern void dl_server_update_idle_time(struct rq *rq, - struct task_struct *p); extern void fair_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, @@ -682,10 +681,10 @@ struct cfs_rq { s64 avg_vruntime; u64 avg_load; - u64 min_vruntime; + u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; - u64 min_vruntime_fi; + u64 zero_vruntime_fi; #endif struct rb_root_cached tasks_timeline; @@ -780,10 +779,10 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ + SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */ SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_BALANCE = 1 << 17, @@ -804,10 +803,12 @@ struct scx_rq { cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; - unsigned long pnt_seq; + unsigned long kick_sync; + local_t reenq_local_deferred; struct balance_callback deferred_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; + struct scx_dispatch_q bypass_dsq; }; #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -1119,6 +1120,8 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; + /* Per class runqueue modification mask; bits in class order. */ + unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1348,6 +1351,12 @@ static inline bool is_migration_disabled(struct task_struct *p) } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) @@ -1431,6 +1440,9 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) if (!sched_core_enabled(rq)) return true; + if (rq->core->core_cookie == p->core_cookie) + return true; + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { if (!available_idle_cpu(cpu)) { idle_core = false; @@ -1442,7 +1454,7 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) * A CPU in an idle core is always the best choice for tasks with * cookies. */ - return idle_core || rq->core->core_cookie == p->core_cookie; + return idle_core; } static inline bool sched_group_cookie_match(struct rq *rq, @@ -1826,7 +1838,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void +__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1838,8 +1851,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - rq_unpin_lock(rq, rf); - raw_spin_rq_unlock(rq); + __task_rq_unlock(rq, p, rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } @@ -1848,6 +1860,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, + _T->rq = __task_rq_lock(_T->lock, &_T->rf), + __task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { @@ -2208,6 +2225,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; + rseq_sched_set_ids_changed(p); #endif /* CONFIG_SMP */ } @@ -2341,8 +2359,7 @@ extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: * - * DEQUEUE_SLEEP - task is no longer runnable - * ENQUEUE_WAKEUP - task just became runnable + * SLEEP/WAKEUP - task is no-longer/just-became runnable * * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks * are in a known state which allows modification. Such pairs @@ -2355,34 +2372,46 @@ extern const u32 sched_prio_to_wmult[40]; * * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) * + * DELAYED - de/re-queue a sched_delayed task + * + * CLASS - going to update p->sched_class; makes sched_change call the + * various switch methods. + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * + * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but + * SCHED_DEADLINE seems to rely on this for now. */ -#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ -#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ -#define DEQUEUE_SPECIAL 0x10 -#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ -#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ -#define DEQUEUE_THROTTLE 0x800 - -#define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_RESTORE 0x02 -#define ENQUEUE_MOVE 0x04 -#define ENQUEUE_NOCLOCK 0x08 - -#define ENQUEUE_HEAD 0x10 -#define ENQUEUE_REPLENISH 0x20 -#define ENQUEUE_MIGRATED 0x40 -#define ENQUEUE_INITIAL 0x80 -#define ENQUEUE_MIGRATING 0x100 -#define ENQUEUE_DELAYED 0x200 -#define ENQUEUE_RQ_SELECTED 0x400 +#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */ +#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */ + +#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */ + +#define DEQUEUE_SPECIAL 0x00010000 +#define DEQUEUE_THROTTLE 0x00020000 + +#define ENQUEUE_WAKEUP 0x0001 +#define ENQUEUE_RESTORE 0x0002 +#define ENQUEUE_MOVE 0x0004 +#define ENQUEUE_NOCLOCK 0x0008 + +#define ENQUEUE_MIGRATING 0x0010 +#define ENQUEUE_DELAYED 0x0020 +#define ENQUEUE_CLASS 0x0040 + +#define ENQUEUE_HEAD 0x00010000 +#define ENQUEUE_REPLENISH 0x00020000 +#define ENQUEUE_MIGRATED 0x00040000 +#define ENQUEUE_INITIAL 0x00080000 +#define ENQUEUE_RQ_SELECTED 0x00100000 #define RETRY_TASK ((void *)-1UL) @@ -2399,16 +2428,61 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif + /* + * idle: 0 + * ext: 1 + * fair: 2 + * rt: 4 + * dl: 8 + * stop: 16 + */ + unsigned int queue_mask; + /* + * move_queued_task/activate_task/enqueue_task: rq->lock + * ttwu_do_activate/activate_task/enqueue_task: rq->lock + * wake_up_new_task/activate_task/enqueue_task: task_rq_lock + * ttwu_runnable/enqueue_task: task_rq_lock + * proxy_task_current: rq->lock + * sched_change_end + */ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + /* + * move_queued_task/deactivate_task/dequeue_task: rq->lock + * __schedule/block_task/dequeue_task: rq->lock + * proxy_task_current: rq->lock + * wait_task_inactive: task_rq_lock + * sched_change_begin + */ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + + /* + * do_sched_yield: rq->lock + */ void (*yield_task) (struct rq *rq); + /* + * yield_to: rq->lock (double) + */ bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + /* + * move_queued_task: rq->lock + * __migrate_swap_task: rq->lock + * ttwu_do_activate: rq->lock + * ttwu_runnable: task_rq_lock + * wake_up_new_task: task_rq_lock + */ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + /* + * schedule/pick_next_task/prev_balance: rq->lock + */ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - struct task_struct *(*pick_task)(struct rq *rq); + + /* + * schedule/pick_next_task: rq->lock + */ + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); /* * Optional! When implemented pick_next_task() should be equivalent to: * @@ -2418,55 +2492,123 @@ struct sched_class { * set_next_task_first(next); * } */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); + /* + * sched_change: + * __schedule: rq->lock + */ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + /* + * select_task_rq: p->pi_lock + * sched_exec: p->pi_lock + */ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + /* + * set_task_cpu: p->pi_lock || rq->lock (ttwu like) + */ void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + /* + * ttwu_do_activate: rq->lock + * wake_up_new_task: task_rq_lock + */ void (*task_woken)(struct rq *this_rq, struct task_struct *task); + /* + * do_set_cpus_allowed: task_rq_lock + sched_change + */ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); + /* + * sched_set_rq_{on,off}line: rq->lock + */ void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + /* + * push_cpu_stop: p->pi_lock && rq->lock + */ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); + /* + * hrtick: rq->lock + * sched_tick: rq->lock + * sched_tick_remote: rq->lock + */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + /* + * sched_cgroup_fork: p->pi_lock + */ void (*task_fork)(struct task_struct *p); + /* + * finish_task_switch: no locks + */ void (*task_dead)(struct task_struct *p); /* - * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. + * sched_change + */ + void (*switching_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switching_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + u64 (*get_prio) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + u64 oldprio); + + /* + * set_load_weight: task_rq_lock + sched_change + * __setscheduler_parms: task_rq_lock + sched_change */ - void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*reweight_task)(struct rq *this_rq, struct task_struct *task, const struct load_weight *lw); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + /* + * sched_rr_get_interval: task_rq_lock + */ unsigned int (*get_rr_interval)(struct rq *rq, struct task_struct *task); + /* + * task_sched_runtime: task_rq_lock + */ void (*update_curr)(struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED + /* + * sched_change_group: task_rq_lock + sched_change + */ void (*task_change_group)(struct task_struct *p); #endif #ifdef CONFIG_SCHED_CORE + /* + * pick_next_task: rq->lock + * try_steal_cookie: rq->lock (double) + */ int (*task_is_throttled)(struct task_struct *p, int cpu); #endif }; +/* + * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. + */ +static inline void rq_modified_clear(struct rq *rq) +{ + rq->queue_mask = 0; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) +{ + unsigned int mask = class->queue_mask; + return rq->queue_mask & ~((mask << 1) - 1); +} + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -2578,8 +2720,9 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_task_idle(struct rq *rq); +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); +extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2609,7 +2752,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) static inline cpumask_t *alloc_user_cpus_ptr(int node) { /* - * See do_set_cpus_allowed() above for the rcu_head usage. + * See set_cpus_allowed_force() above for the rcu_head usage. */ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); @@ -3539,285 +3682,212 @@ extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID -#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ -#define MM_CID_SCAN_DELAY 100 /* 100ms */ +static __always_inline bool cid_on_cpu(unsigned int cid) +{ + return cid & MM_CID_ONCPU; +} -extern raw_spinlock_t cid_lock; -extern int use_cid_lock; +static __always_inline bool cid_in_transit(unsigned int cid) +{ + return cid & MM_CID_TRANSIT; +} -extern void sched_mm_cid_migrate_from(struct task_struct *t); -extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); -extern void init_sched_mm_cid(struct task_struct *t); +static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid) +{ + return cid & ~MM_CID_ONCPU; +} -static inline void __mm_cid_put(struct mm_struct *mm, int cid) +static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid) { - if (cid < 0) - return; - cpumask_clear_cpu(cid, mm_cidmask(mm)); + return cid | MM_CID_ONCPU; } -/* - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to - * be held to transition to other states. - * - * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. - */ -static inline void mm_cid_put_lazy(struct task_struct *t) +static __always_inline unsigned int cid_to_transit_cid(unsigned int cid) { - struct mm_struct *mm = t->mm; - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid; + return cid | MM_CID_TRANSIT; +} - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - if (!mm_cid_is_lazy_put(cid) || - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +static __always_inline unsigned int cid_from_transit_cid(unsigned int cid) +{ + return cid & ~MM_CID_TRANSIT; } -static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +static __always_inline bool cid_on_task(unsigned int cid) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, res; + /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */ + return cid < MM_CID_TRANSIT; +} - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - for (;;) { - if (mm_cid_is_unset(cid)) - return MM_CID_UNSET; - /* - * Attempt transition from valid or lazy-put to unset. - */ - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); - if (res == cid) - break; - cid = res; - } - return cid; +static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid) +{ + clear_bit(cid, mm_cidmask(mm)); } -static inline void mm_cid_put(struct mm_struct *mm) +static __always_inline void mm_unset_cid_on_task(struct task_struct *t) { - int cid; + unsigned int cid = t->mm_cid.cid; - lockdep_assert_irqs_disabled(); - cid = mm_cid_pcpu_unset(mm); - if (cid == MM_CID_UNSET) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + t->mm_cid.cid = MM_CID_UNSET; + if (cid_on_task(cid)) + mm_drop_cid(t->mm, cid); } -static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) +static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) { - struct cpumask *cidmask = mm_cidmask(mm); - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, max_nr_cid, allowed_max_nr_cid; + /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ + pcp->cid = cpu_cid_to_cid(pcp->cid); + mm_drop_cid(mm, pcp->cid); +} - /* - * After shrinking the number of threads or reducing the number - * of allowed cpus, reduce the value of max_nr_cid so expansion - * of cid allocation will preserve cache locality if the number - * of threads or allowed cpus increase again. - */ - max_nr_cid = atomic_read(&mm->max_nr_cid); - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), - atomic_read(&mm->mm_users))), - max_nr_cid > allowed_max_nr_cid) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { - max_nr_cid = allowed_max_nr_cid; - break; - } - } - /* Try to re-use recent cid. This improves cache locality. */ - cid = __this_cpu_read(pcpu_cid->recent_cid); - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && - !cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - /* - * Expand cid allocation if the maximum number of concurrency - * IDs allocated (max_nr_cid) is below the number cpus allowed - * and number of threads. Expanding cid allocation as much as - * possible improves cache locality. - */ - cid = max_nr_cid; - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) - continue; - if (!cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - } - /* - * Find the first available concurrency id. - * Retry finding first zero bit if the mask is temporarily - * filled. This only happens during concurrent remote-clear - * which owns a cid without holding a rq lock. - */ - for (;;) { - cid = cpumask_first_zero(cidmask); - if (cid < READ_ONCE(mm->nr_cpus_allowed)) - break; - cpu_relax(); - } - if (cpumask_test_and_set_cpu(cid, cidmask)) - return -1; +static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) +{ + unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids); + if (cid >= max_cids) + return MM_CID_UNSET; + if (test_and_set_bit(cid, mm_cidmask(mm))) + return MM_CID_UNSET; return cid; } -/* - * Save a snapshot of the current runqueue time of this cpu - * with the per-cpu cid value, allowing to estimate how recently it was used. - */ -static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) +static inline unsigned int mm_get_cid(struct mm_struct *mm) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids)); - lockdep_assert_rq_held(rq); - WRITE_ONCE(pcpu_cid->time, rq->clock); + while (cid == MM_CID_UNSET) { + cpu_relax(); + cid = __mm_get_cid(mm, num_possible_cpus()); + } + return cid; } -static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid, + unsigned int max_cids) { - int cid; + unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid); - /* - * All allocations (even those using the cid_lock) are lock-free. If - * use_cid_lock is set, hold the cid_lock to perform cid allocation to - * guarantee forward progress. - */ - if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto end; - raw_spin_lock(&cid_lock); - } else { - raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto unlock; + /* Is it in the optimal CID space? */ + if (likely(cid < max_cids)) + return orig_cid; + + /* Try to find one in the optimal space. Otherwise keep the provided. */ + new_cid = __mm_get_cid(mm, max_cids); + if (new_cid != MM_CID_UNSET) { + mm_drop_cid(mm, cid); + /* Preserve the ONCPU mode of the original CID */ + return new_cid | (orig_cid & MM_CID_ONCPU); } + return orig_cid; +} - /* - * cid concurrently allocated. Retry while forcing following - * allocations to use the cid_lock to ensure forward progress. - */ - WRITE_ONCE(use_cid_lock, 1); - /* - * Set use_cid_lock before allocation. Only care about program order - * because this is only required for forward progress. - */ - barrier(); - /* - * Retry until it succeeds. It is guaranteed to eventually succeed once - * all newcoming allocations observe the use_cid_lock flag set. - */ - do { - cid = __mm_cid_try_get(t, mm); - cpu_relax(); - } while (cid < 0); - /* - * Allocate before clearing use_cid_lock. Only care about - * program order because this is for forward progress. - */ - barrier(); - WRITE_ONCE(use_cid_lock, 0); -unlock: - raw_spin_unlock(&cid_lock); -end: - mm_cid_snapshot_time(rq, mm); +static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid) +{ + if (t->mm_cid.cid != cid) { + t->mm_cid.cid = cid; + rseq_sched_set_ids_changed(t); + } +} - return cid; +static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid) +{ + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); } -static inline int mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - struct cpumask *cpumask; - int cid; + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; - lockdep_assert_rq_held(rq); - cpumask = mm_cidmask(mm); - cid = __this_cpu_read(pcpu_cid->cid); - if (mm_cid_is_valid(cid)) { - mm_cid_snapshot_time(rq, mm); - return cid; - } - if (mm_cid_is_lazy_put(cid)) { - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case where both have the ONCPU bit set */ + if (likely(cid_on_cpu(cpu_cid & tcid))) { + if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) { + mm_cid_update_task_cid(t, cpu_cid); + return; + } + /* Try to converge into the optimal CID space */ + cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids); + } else { + /* Hand over or drop the task owned CID */ + if (cid_on_task(tcid)) { + if (cid_on_cpu(cpu_cid)) + mm_unset_cid_on_task(t); + else + cpu_cid = cid_to_cpu_cid(tcid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_cpu(cpu_cid)) + cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); } - cid = __mm_cid_get(rq, t, mm); - __this_cpu_write(pcpu_cid->cid, cid); - __this_cpu_write(pcpu_cid->recent_cid, cid); - - return cid; + mm_cid_update_pcpu_cid(mm, cpu_cid); + mm_cid_update_task_cid(t, cpu_cid); } -static inline void switch_mm_cid(struct rq *rq, - struct task_struct *prev, - struct task_struct *next) +static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) { - /* - * Provide a memory barrier between rq->curr store and load of - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. - * - * Should be adapted if context_switch() is modified. - */ - if (!next->mm) { // to kernel - /* - * user -> kernel transition does not guarantee a barrier, but - * we can use the fact that it performs an atomic operation in - * mmgrab(). - */ - if (prev->mm) // from user - smp_mb__after_mmgrab(); - /* - * kernel -> kernel transition does not change rq->curr->mm - * state. It stays NULL. - */ - } else { // to user - /* - * kernel -> user transition does not provide a barrier - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. - * Provide it here. - */ - if (!prev->mm) { // from kernel - smp_mb(); - } else { // from user - /* - * user->user transition relies on an implicit - * memory barrier in switch_mm() when - * current->mm changes. If the architecture - * switch_mm() does not have an implicit memory - * barrier, it is emitted here. If current->mm - * is unchanged, no barrier is needed. - */ - smp_mb__after_switch_mm(); + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; + + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case, where both have the ONCPU bit clear */ + if (likely(cid_on_task(tcid | cpu_cid))) { + if (likely(tcid < max_cids)) { + mm_cid_update_pcpu_cid(mm, tcid); + return; } + /* Try to converge into the optimal CID space */ + tcid = mm_cid_converge(mm, tcid, max_cids); + } else { + /* Hand over or drop the CPU owned CID */ + if (cid_on_cpu(cpu_cid)) { + if (cid_on_task(tcid)) + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); + else + tcid = cpu_cid_to_cid(cpu_cid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_task(tcid)) + tcid = mm_get_cid(mm); + /* Set the transition mode flag if required */ + tcid |= READ_ONCE(mm->mm_cid.transit); } - if (prev->mm_cid_active) { - mm_cid_snapshot_time(rq, prev->mm); - mm_cid_put_lazy(prev); - prev->mm_cid = -1; - } - if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); + mm_cid_update_pcpu_cid(mm, tcid); + mm_cid_update_task_cid(t, tcid); +} + +static __always_inline void mm_cid_schedin(struct task_struct *next) +{ + struct mm_struct *mm = next->mm; + unsigned int cpu_cid; + + if (!next->mm_cid.active) + return; + + cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); + if (likely(!READ_ONCE(mm->mm_cid.percpu))) + mm_cid_from_task(next, cpu_cid); + else + mm_cid_from_cpu(next, cpu_cid); +} + +static __always_inline void mm_cid_schedout(struct task_struct *prev) +{ + /* During mode transitions CIDs are temporary and need to be dropped */ + if (likely(!cid_in_transit(prev->mm_cid.cid))) + return; + + mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid)); + prev->mm_cid.cid = MM_CID_UNSET; +} + +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) +{ + mm_cid_schedout(prev); + mm_cid_schedin(next); } #else /* !CONFIG_SCHED_MM_CID: */ -static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } -static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } -static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } -static inline void init_sched_mm_cid(struct task_struct *t) { } +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); @@ -3876,32 +3946,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class); -extern void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio); - extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#ifdef CONFIG_SCHED_CLASS_EXT /* - * Used by SCX in the enable/disable paths to move tasks between sched_classes - * and establish invariants. + * The 'sched_change' pattern is the safe, easy and slow way of changing a + * task's scheduling properties. It dequeues a task, such that the scheduler + * is fully unaware of it; at which point its properties can be modified; + * after which it is enqueued again. + * + * Typically this must be called while holding task_rq_lock, since most/all + * properties are serialized under those locks. There is currently one + * exception to this rule in sched/ext which only holds rq->lock. */ -struct sched_enq_and_set_ctx { + +/* + * This structure is a temporary, used to preserve/convey the queueing state + * of the task between sched_change_begin() and sched_change_end(). Ensuring + * the task's queueing state is idempotent across the operation. + */ +struct sched_change_ctx { + u64 prio; struct task_struct *p; - int queue_flags; + int flags; bool queued; bool running; }; -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx); -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); +void sched_change_end(struct sched_change_ctx *ctx); -#endif /* CONFIG_SCHED_CLASS_EXT */ +DEFINE_CLASS(sched_change, struct sched_change_ctx *, + sched_change_end(_T), + sched_change_begin(p, flags), + struct task_struct *p, unsigned int flags) + +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) #include "ext.h" diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 26f3fd4d34ce..cbf7206b3f9d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) rq = __task_rq_lock(p, &rf); psi_task_change(p, p->psi_flags, 0); - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); } } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 2d4e279f05ee..4f9192be4b5b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) { if (!sched_stop_runnable(rq)) return NULL; @@ -75,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p) +static void switching_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } static void -prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); /* how!?, what priority? */ } @@ -95,6 +98,8 @@ static void update_curr_stop(struct rq *rq) */ DEFINE_SCHED_CLASS(stop) = { + .queue_mask = 16, + .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, @@ -112,6 +117,6 @@ DEFINE_SCHED_CLASS(stop) = { .task_tick = task_tick_stop, .prio_changed = prio_changed_stop, - .switched_to = switched_to_stop, + .switching_to = switching_to_stop, .update_curr = update_curr_stop, }; diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 77ae87f36e84..0496dc29ed0f 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; - struct rq *rq; int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - CLASS(task_rq_lock, rq_guard)(p); - rq = rq_guard.rq; - - update_rq_clock(rq); + guard(task_rq_lock)(p); /* * The RT priorities are set via sched_setscheduler(), but we still @@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice) return; } - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); + scoped_guard (sched_change, p, DEQUEUE_SAVE) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } } EXPORT_SYMBOL(set_user_nice); @@ -515,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; @@ -695,38 +674,27 @@ change: prev_class = p->sched_class; next_class = __setscheduler_class(policy, newprio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flags |= DEQUEUE_CLASS; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - p->sched_class = next_class; - p->prio = newprio; - } - __setscheduler_uclamp(p, attr); - check_class_changing(rq, p, prev_class); + scoped_guard (sched_change, p, queue_flags) { - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); - enqueue_task(rq, p, queue_flags); + if (scope->queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + scope->flags |= ENQUEUE_HEAD; + } } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); /* Avoid rq from going away on us: */ preempt_disable(); @@ -856,6 +824,19 @@ void sched_set_fifo_low(struct task_struct *p) } EXPORT_SYMBOL_GPL(sched_set_fifo_low); +/* + * Used when the primary interrupt handler is forced into a thread, in addition + * to the (always threaded) secondary handler. The secondary handler gets a + * slightly lower priority so that the primary handler can preempt it, thereby + * emulating the behavior of a non-PREEMPT_RT system where the primary handler + * runs in hard interrupt context. + */ +void sched_set_fifo_secondary(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} + void sched_set_normal(struct task_struct *p, int nice) { struct sched_attr attr = { @@ -1351,7 +1332,7 @@ static void do_sched_yield(void) rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); - current->sched_class->yield_task(rq); + rq->donor->sched_class->yield_task(rq); preempt_disable(); rq_unlock_irq(rq, &rf); @@ -1420,12 +1401,13 @@ EXPORT_SYMBOL(yield); */ int __sched yield_to(struct task_struct *p, bool preempt) { - struct task_struct *curr = current; + struct task_struct *curr; struct rq *rq, *p_rq; int yielded = 0; scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { rq = this_rq(); + curr = rq->donor; again: p_rq = task_rq(p); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 444bdfdab731..cf643a5ddedd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA enum numa_topology_type sched_numa_topology_type; +/* + * sched_domains_numa_distance is derived from sched_numa_node_distance + * and provides a simplified view of NUMA distances used specifically + * for building NUMA scheduling domains. + */ static int sched_domains_numa_levels; +static int sched_numa_node_levels; int sched_max_numa_distance; static int *sched_domains_numa_distance; +static int *sched_numa_node_distance; static struct cpumask ***sched_domains_numa_masks; #endif /* CONFIG_NUMA */ @@ -1662,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, @@ -1845,10 +1858,10 @@ bool find_numa_distance(int distance) return true; rcu_read_lock(); - distances = rcu_dereference(sched_domains_numa_distance); + distances = rcu_dereference(sched_numa_node_distance); if (!distances) goto unlock; - for (i = 0; i < sched_domains_numa_levels; i++) { + for (i = 0; i < sched_numa_node_levels; i++) { if (distances[i] == distance) { found = true; break; @@ -1924,14 +1937,34 @@ static void init_numa_topology_type(int offline_node) #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(int offline_node) +/* + * An architecture could modify its NUMA distance, to change + * grouping of NUMA nodes and number of NUMA levels when creating + * NUMA level sched domains. + * + * A NUMA level is created for each unique + * arch_sched_node_distance. + */ +static int numa_node_dist(int i, int j) { - struct sched_domain_topology_level *tl; - unsigned long *distance_map; + return node_distance(i, j); +} + +int arch_sched_node_distance(int from, int to) + __weak __alias(numa_node_dist); + +static bool modified_sched_node_distance(void) +{ + return numa_node_dist != arch_sched_node_distance; +} + +static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), + int **dist, int *levels) +{ + unsigned long *distance_map __free(bitmap) = NULL; int nr_levels = 0; int i, j; int *distances; - struct cpumask ***masks; /* * O(nr_nodes^2) de-duplicating selection sort -- in order to find the @@ -1939,17 +1972,16 @@ void sched_init_numa(int offline_node) */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); if (!distance_map) - return; + return -ENOMEM; bitmap_zero(distance_map, NR_DISTANCE_VALUES); for_each_cpu_node_but(i, offline_node) { for_each_cpu_node_but(j, offline_node) { - int distance = node_distance(i, j); + int distance = n_dist(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); - bitmap_free(distance_map); - return; + return -EINVAL; } bitmap_set(distance_map, distance, 1); @@ -1962,18 +1994,46 @@ void sched_init_numa(int offline_node) nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!distances) { - bitmap_free(distance_map); - return; - } + if (!distances) + return -ENOMEM; for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); distances[i] = j; } - rcu_assign_pointer(sched_domains_numa_distance, distances); + *dist = distances; + *levels = nr_levels; - bitmap_free(distance_map); + return 0; +} + +void sched_init_numa(int offline_node) +{ + struct sched_domain_topology_level *tl; + int nr_levels, nr_node_levels; + int i, j; + int *distances, *domain_distances; + struct cpumask ***masks; + + /* Record the NUMA distances from SLIT table */ + if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, + &nr_node_levels)) + return; + + /* Record modified NUMA distances for building sched domains */ + if (modified_sched_node_distance()) { + if (sched_record_numa_dist(offline_node, arch_sched_node_distance, + &domain_distances, &nr_levels)) { + kfree(distances); + return; + } + } else { + domain_distances = distances; + nr_levels = nr_node_levels; + } + rcu_assign_pointer(sched_numa_node_distance, distances); + WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); + WRITE_ONCE(sched_numa_node_levels, nr_node_levels); /* * 'nr_levels' contains the number of unique distances @@ -1991,6 +2051,8 @@ void sched_init_numa(int offline_node) * * We reset it to 'nr_levels' at the end of this function. */ + rcu_assign_pointer(sched_domains_numa_distance, domain_distances); + sched_domains_numa_levels = 0; masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); @@ -2016,10 +2078,13 @@ void sched_init_numa(int offline_node) masks[i][j] = mask; for_each_cpu_node_but(k, offline_node) { - if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + if (sched_debug() && + (arch_sched_node_distance(j, k) != + arch_sched_node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); - if (node_distance(j, k) > sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, k) > + sched_domains_numa_distance[i]) continue; cpumask_or(mask, mask, cpumask_of_node(k)); @@ -2059,7 +2124,6 @@ void sched_init_numa(int offline_node) sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); init_numa_topology_type(offline_node); } @@ -2067,14 +2131,18 @@ void sched_init_numa(int offline_node) static void sched_reset_numa(void) { - int nr_levels, *distances; + int nr_levels, *distances, *dom_distances = NULL; struct cpumask ***masks; nr_levels = sched_domains_numa_levels; + sched_numa_node_levels = 0; sched_domains_numa_levels = 0; sched_max_numa_distance = 0; sched_numa_topology_type = NUMA_DIRECT; - distances = sched_domains_numa_distance; + distances = sched_numa_node_distance; + if (sched_numa_node_distance != sched_domains_numa_distance) + dom_distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_numa_node_distance, NULL); rcu_assign_pointer(sched_domains_numa_distance, NULL); masks = sched_domains_numa_masks; rcu_assign_pointer(sched_domains_numa_masks, NULL); @@ -2083,6 +2151,7 @@ static void sched_reset_numa(void) synchronize_rcu(); kfree(distances); + kfree(dom_distances); for (i = 0; i < nr_levels && masks; i++) { if (!masks[i]) continue; @@ -2129,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu) continue; /* Set ourselves in the remote node's masks */ - if (node_distance(j, node) <= sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, node) <= + sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } } diff --git a/kernel/signal.c b/kernel/signal.c index fe9190d84f28..e42b8bd6922f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *tsk) * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); diff --git a/kernel/task_work.c b/kernel/task_work.c index d1efec571a4a..0f7519f8e7c9 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { - test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + /* + * no-op IPI + * + * TWA_NMI_CURRENT will already have set the TIF flag, all + * this interrupt does it tickle the return-to-user path. + */ } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); @@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88aa062b8a55..f8ea8c8fc895 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2145,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + hrtimer_set_expires(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -2172,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + restart->nanosleep.expires = hrtimer_get_expires(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 5b6997f4dc3d..e76be24b132c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -478,11 +478,8 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.ns_type = ns_common_type(&init_time_ns), - .ns.__ns_ref = REFCOUNT_INIT(3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_time_ns), - .ns.ops = &timens_operations, .frozen_offsets = true, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2e5b89d7d866..0de2bb7cbec0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1557,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; - restart->nanosleep.expires = expires; + restart->nanosleep.expires = ns_to_ktime(expires); if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); } @@ -1599,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec64 t; - t = ns_to_timespec64(restart_block->nanosleep.expires); + t = ktime_to_timespec64(restart_block->nanosleep.expires); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index aa3120104a51..80a8a09a21a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -475,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - /* Special case for CRIU to restore timers with a given timer ID. */ if (unlikely(current->signal->timer_create_restore_ids)) { if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) @@ -490,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EINVAL; } + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. @@ -1242,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only - * affects the presicion of the time returned by sys_clock_gettime(). + * affects the precision of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5e2c2c26b3cc..ffee943d796d 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -19,6 +19,10 @@ /** * tick_program_event - program the CPU local timer device for the next event + * @expires: the time at which the next timer event should occur + * @force: flag to force reprograming even if the event time hasn't changed + * + * Return: 0 on success, negative error code on failure */ int tick_program_event(ktime_t expires, int force) { @@ -57,6 +61,13 @@ void tick_resume_oneshot(void) /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + * @newdev: Pointer to the clock event device to configure + * @handler: Function to be called when the event device triggers an interrupt + * @next_event: Initial expiry time for the next event (in ktime) + * + * Configures the specified clock event device for onshot mode, + * assigns the given handler as its event callback, and programs + * the device to trigger at the specified next event time. */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev, /** * tick_switch_to_oneshot - switch to oneshot mode + * @handler: function to call when an event occurs on the tick device + * + * Return: 0 on success, -EINVAL if the tick device is not present, + * not functional, or does not support oneshot mode. */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { @@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * - * returns 1 when either nohz or highres are enabled. otherwise 0. + * Return: 1 when either nohz or highres are enabled, otherwise 0. */ int tick_oneshot_mode_active(void) { @@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void) * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. + * + * Return: 0 on success, -EINVAL if the tick device cannot switch + * to oneshot/high-resolution mode. */ int tick_init_highres(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c527b421c865..8ddf74e705d3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, ts->flags &= ~flag; } +/* + * Allow only one non-timekeeper CPU at a time update jiffies from + * the timer tick. + * + * Returns true if update was run. + */ +static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) +{ + static atomic_t in_progress; + int inp; + + inp = atomic_read(&in_progress); + if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) + return false; + + if (ts->last_tick_jiffies == jiffies) + tick_do_update_jiffies64(now); + atomic_set(&in_progress, 0); + return true; +} + #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) @@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { - if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { - tick_do_update_jiffies64(now); - ts->stalled_jiffies = 0; - ts->last_tick_jiffies = READ_ONCE(jiffies); + if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { + if (tick_limited_update_jiffies64(ts, now)) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } } } @@ -1152,16 +1174,15 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit >= 10) - return false; - /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b6974fce800c..4790da895203 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3060,29 +3060,34 @@ static const struct attribute_group aux_clock_enable_attr_group = { static int __init tk_aux_sysfs_init(void) { struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; if (!tko) - return -ENOMEM; + return ret; auxo = kobject_create_and_add("aux_clocks", tko); - if (!auxo) { - kobject_put(tko); - return -ENOMEM; - } + if (!auxo) + goto err_clean; - for (int i = 0; i <= MAX_AUX_CLOCKS; i++) { + for (int i = 0; i < MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); - if (!clk) - return -ENOMEM; - - int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + if (!clk) { + ret = -ENOMEM; + goto err_clean; + } + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); if (ret) - return ret; + goto err_clean; } return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; } late_initcall(tk_aux_sysfs_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 553fa469d7cc..1f2364126894 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1458,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); @@ -2472,7 +2473,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK - if (in_irq()) + if (in_hardirq()) irq_work_tick(); #endif sched_tick(); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index c0c54dc5314c..18dda1aa782d 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/timerqueue.h> #include <trace/events/ipi.h> +#include <linux/sched/isolation.h> #include "timer_migration.h" #include "tick-internal.h" @@ -420,14 +421,54 @@ static struct list_head *tmigr_level_list __read_mostly; static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; +static struct tmigr_group *tmigr_root; + static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); +/* + * CPUs available for timer migration. + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * Additionally tmigr_available_mutex serializes set/clear operations with each other. + */ +static cpumask_var_t tmigr_available_cpumask; +static DEFINE_MUTEX(tmigr_available_mutex); + +/* Enabled during late initcall */ +static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated); + #define TMIGR_NONE 0xFF #define BIT_CNT 8 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) { - return !(tmc->tmgroup && tmc->online); + return !(tmc->tmgroup && tmc->available); +} + +/* + * Returns true if @cpu should be excluded from the hierarchy as isolated. + * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs + * are still part of the hierarchy but become idle (from a tick and timer + * migration perspective) when they stop their tick. This lets the timekeeping + * CPU handle their global timers. Marking also isolated CPUs as idle would be + * too costly, hence they are completely excluded from the hierarchy. + * This check is necessary, for instance, to prevent offline isolated CPUs from + * being incorrectly marked as available once getting back online. + * + * This function returns false during early boot and the isolation logic is + * enabled only after isolated CPUs are marked as unavailable at late boot. + * The tick CPU can be isolated at boot, however we cannot mark it as + * unavailable to avoid having no global migrator for the nohz_full CPUs. This + * should be ensured by the callers of this function: implicitly from hotplug + * callbacks and explicitly in tmigr_init_isolation() and + * tmigr_isolated_exclude_cpumask(). + */ +static inline bool tmigr_is_isolated(int cpu) +{ + if (!static_branch_unlikely(&tmigr_exclude_isolated)) + return false; + return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) || + cpuset_cpu_is_isolated(cpu)) && + housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE); } /* @@ -502,11 +543,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group) * @now: timer base monotonic * @check: is set if there is the need to handle remote timers; * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. */ struct tmigr_walk { u64 nextexp; @@ -517,16 +553,13 @@ struct tmigr_walk { unsigned long basej; u64 now; bool check; - bool tmc_active; }; typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, struct tmigr_walk *data, - struct tmigr_cpu *tmc) +static void __walk_groups_from(up_f up, struct tmigr_walk *data, + struct tmigr_group *child, struct tmigr_group *group) { - struct tmigr_group *child = NULL, *group = tmc->tmgroup; - do { WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); @@ -544,6 +577,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, } while (group); } +static void __walk_groups(up_f up, struct tmigr_walk *data, + struct tmigr_cpu *tmc) +{ + __walk_groups_from(up, data, NULL, tmc->tmgroup); +} + static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); @@ -708,7 +747,7 @@ void tmigr_cpu_activate(void) /* * Returns true, if there is nothing to be propagated to the next level * - * @data->firstexp is set to expiry of first gobal event of the (top level of + * @data->firstexp is set to expiry of first global event of the (top level of * the) hierarchy, but only when hierarchy is completely idle. * * The child and group states need to be read under the lock, to prevent a race @@ -926,7 +965,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * updated the event takes care when hierarchy is completely * idle. Otherwise the migrator does it as the event is enqueued. */ - if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || + if (!tmc->available || tmc->remote || tmc->cpuevt.ignore || now < tmc->cpuevt.nextevt.expires) { raw_spin_unlock_irq(&tmc->lock); return; @@ -973,7 +1012,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * (See also section "Required event and timerqueue update after a * remote expiry" in the documentation at the top) */ - if (!tmc->online || !tmc->idle) { + if (!tmc->available || !tmc->idle) { timer_unlock_remote_bases(cpu); goto unlock; } @@ -1113,15 +1152,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, */ if (!tmigr_check_migrator(group, childmask)) return true; - - /* - * When there is a parent group and the CPU which triggered the - * hierarchy walk is not active, proceed the walk to reach the top level - * group before reading the next_expiry value. - */ - if (group->parent && !data->tmc_active) - return false; - /* * The lock is required on 32bit architectures to read the variable * consistently with a concurrent writer. On 64bit the lock is not @@ -1166,7 +1196,6 @@ bool tmigr_requires_handle_remote(void) data.now = get_jiffies_update(&jif); data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; - data.tmc_active = !tmc->idle; data.check = false; /* @@ -1432,38 +1461,43 @@ static long tmigr_trigger_active(void *unused) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - WARN_ON_ONCE(!tmc->online || tmc->idle); + WARN_ON_ONCE(!tmc->available || tmc->idle); return 0; } -static int tmigr_cpu_offline(unsigned int cpu) +static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); int migrator; u64 firstexp; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); + guard(mutex)(&tmigr_available_mutex); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + cpumask_clear_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (!tmc->available) + return 0; + tmc->available = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_unavailable(tmc); + } if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); + migrator = cpumask_any(tmigr_available_cpumask); work_on_cpu(migrator, tmigr_trigger_active, NULL); } return 0; } -static int tmigr_cpu_online(unsigned int cpu) +static int tmigr_set_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1471,16 +1505,123 @@ static int tmigr_cpu_online(unsigned int cpu) if (WARN_ON_ONCE(!tmc->tmgroup)) return -EINVAL; - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); + if (tmigr_is_isolated(cpu)) + return 0; + + guard(mutex)(&tmigr_available_mutex); + + cpumask_set_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (tmc->available) + return 0; + trace_tmigr_cpu_available(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->available = true; + } return 0; } +static void tmigr_cpu_isolate(struct work_struct *ignored) +{ + tmigr_clear_cpu_available(smp_processor_id()); +} + +static void tmigr_cpu_unisolate(struct work_struct *ignored) +{ + tmigr_set_cpu_available(smp_processor_id()); +} + +/** + * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy + * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy + * + * This function can be called from cpuset code to provide the new set of + * isolated CPUs that should be excluded from the hierarchy. + * Online CPUs not present in exclude_cpumask but already excluded are brought + * back to the hierarchy. + * Functions to isolate/unisolate need to be called locally and can sleep. + */ +int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + struct work_struct __percpu *works __free(free_percpu) = + alloc_percpu(struct work_struct); + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + int cpu; + + lockdep_assert_cpus_held(); + + if (!works) + return -ENOMEM; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + /* + * First set previously isolated CPUs as available (unisolate). + * This cpumask contains only CPUs that switched to available now. + */ + cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); + cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_unisolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + /* + * Then clear previously available CPUs (isolate). + * This cpumask contains only CPUs that switched to not available now. + * There cannot be overlap with the newly available ones. + */ + cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask); + cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)); + /* + * Handle this here and not in the cpuset code because exclude_cpumask + * might include also the tick CPU if included in isolcpus. + */ + for_each_cpu(cpu, cpumask) { + if (!tick_nohz_cpu_hotpluggable(cpu)) { + cpumask_clear_cpu(cpu, cpumask); + break; + } + } + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_isolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + return 0; +} + +static int __init tmigr_init_isolation(void) +{ + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + + static_branch_enable(&tmigr_exclude_isolated); + + if (!housekeeping_enabled(HK_TYPE_DOMAIN)) + return 0; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + + /* Protect against RCU torture hotplug testing */ + guard(cpus_read_lock)(); + return tmigr_isolated_exclude_cpumask(cpumask); +} +late_initcall(tmigr_init_isolation); + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1498,21 +1639,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); - /* - * If this is a new top-level, prepare its groupmask in advance. - * This avoids accidents where yet another new top-level is - * created in the future and made visible before the current groupmask. - */ - if (list_empty(&tmigr_level_list[lvl])) { - group->groupmask = BIT(0); - /* - * The previous top level has prepared its groupmask already, - * simply account it as the first child. - */ - if (lvl > 0) - group->num_children = 1; - } - timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1520,8 +1646,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, - unsigned int lvl) +static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; @@ -1567,25 +1692,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, return group; } +static bool tmigr_init_root(struct tmigr_group *group, bool activate) +{ + if (!group->parent && group != tmigr_root) { + /* + * This is the new top-level, prepare its groupmask in advance + * to avoid accidents where yet another new top-level is + * created in the future and made visible before this groupmask. + */ + group->groupmask = BIT(0); + WARN_ON_ONCE(activate); + + return true; + } + + return false; + +} + static void tmigr_connect_child_parent(struct tmigr_group *child, struct tmigr_group *parent, bool activate) { - struct tmigr_walk data; - - raw_spin_lock_irq(&child->lock); - raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); + if (tmigr_init_root(parent, activate)) { + /* + * The previous top level had prepared its groupmask already, + * simply account it in advance as the first child. If some groups + * have been created between the old and new root due to node + * mismatch, the new root's child will be intialized accordingly. + */ + parent->num_children = 1; + } - if (activate) { + /* Connecting old root to new root ? */ + if (!parent->parent && activate) { /* - * @child is the old top and @parent the new one. In this - * case groupmask is pre-initialized and @child already - * accounted, along with its new sibling corresponding to the - * CPU going up. + * @child is the old top, or in case of node mismatch, some + * intermediate group between the old top and the new one in + * @parent. In this case the @child must be pre-accounted above + * as the first child. Its new inactive sibling corresponding + * to the CPU going up has been accounted as the second child. */ - WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + WARN_ON_ONCE(parent->num_children != 2); + child->groupmask = BIT(0); } else { - /* Adding @child for the CPU going up to @parent. */ + /* Common case adding @child for the CPU going up to @parent. */ child->groupmask = BIT(parent->num_children++); } @@ -1596,87 +1747,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, */ smp_store_release(&child->parent, parent); - raw_spin_unlock(&parent->lock); - raw_spin_unlock_irq(&child->lock); - trace_tmigr_connect_child_parent(child); - - if (!activate) - return; - - /* - * To prevent inconsistent states, active children need to be active in - * the new parent as well. Inactive children are already marked inactive - * in the parent group: - * - * * When new groups were created by tmigr_setup_groups() starting from - * the lowest level (and not higher then one level below the current - * top level), then they are not active. They will be set active when - * the new online CPU comes active. - * - * * But if a new group above the current top level is required, it is - * mandatory to propagate the active state of the already existing - * child to the new parent. So tmigr_connect_child_parent() is - * executed with the formerly top level group (child) and the newly - * created group (parent). - * - * * It is ensured that the child is active, as this setup path is - * executed in hotplug prepare callback. This is exectued by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. - */ - data.childmask = child->groupmask; - - /* - * There is only one new level per time (which is protected by - * tmigr_mutex). When connecting the child and the parent and set the - * child active when the parent is inactive, the parent needs to be the - * uppermost level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node) +static int tmigr_setup_groups(unsigned int cpu, unsigned int node, + struct tmigr_group *start, bool activate) { struct tmigr_group *group, *child, **stack; - int top = 0, err = 0, i = 0; - struct list_head *lvllist; + int i, top = 0, err = 0, start_lvl = 0; + bool root_mismatch = false; stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); if (!stack) return -ENOMEM; - do { - group = tmigr_get_group(cpu, node, i); + if (start) { + stack[start->level] = start; + start_lvl = start->level + 1; + } + + if (tmigr_root) + root_mismatch = tmigr_root->numa_node != node; + + for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { + group = tmigr_get_group(node, i); if (IS_ERR(group)) { err = PTR_ERR(group); + i--; break; } top = i; - stack[i++] = group; + stack[i] = group; /* * When booting only less CPUs of a system than CPUs are - * available, not all calculated hierarchy levels are required. + * available, not all calculated hierarchy levels are required, + * unless a node mismatch is detected. * * The loop is aborted as soon as the highest level, which might * be different from tmigr_hierarchy_levels, contains only a - * single group. + * single group, unless the nodes mismatch below tmigr_crossnode_level */ - if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) + if (group->parent) break; + if ((!root_mismatch || i >= tmigr_crossnode_level) && + list_is_singular(&tmigr_level_list[i])) + break; + } - } while (i < tmigr_hierarchy_levels); - - /* Assert single root */ - WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + /* Assert single root without parent */ + if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) + return -EINVAL; - while (i > 0) { - group = stack[--i]; + for (; i >= start_lvl; i--) { + group = stack[i]; if (err < 0) { list_del(&group->list); @@ -1692,12 +1817,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) if (i == 0) { struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); - raw_spin_lock_irq(&group->lock); - tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - raw_spin_unlock_irq(&group->lock); + tmigr_init_root(group, activate); trace_tmigr_connect_cpu_parent(tmc); @@ -1705,42 +1828,58 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - /* Will be activated at online time */ - tmigr_connect_child_parent(child, group, false); + tmigr_connect_child_parent(child, group, activate); } + } - /* check if uppermost level was newly created */ - if (top != i) - continue; - - WARN_ON_ONCE(top == 0); + if (err < 0) + goto out; - lvllist = &tmigr_level_list[top]; + if (activate) { + struct tmigr_walk data; + union tmigr_state state; /* - * Newly created root level should have accounted the upcoming - * CPU's child group and pre-accounted the old root. + * To prevent inconsistent states, active children need to be active in + * the new parent as well. Inactive children are already marked inactive + * in the parent group: + * + * * When new groups were created by tmigr_setup_groups() starting from + * the lowest level, then they are not active. They will be set active + * when the new online CPU comes active. + * + * * But if new groups above the current top level are required, it is + * mandatory to propagate the active state of the already existing + * child to the new parents. So tmigr_active_up() activates the + * new parents while walking up from the old root to the new. + * + * * It is ensured that @start is active, as this setup path is + * executed in hotplug prepare callback. This is executed by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. */ - if (group->num_children == 2 && list_is_singular(lvllist)) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. - */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); - - lvllist = &tmigr_level_list[top - 1]; - list_for_each_entry(child, lvllist, list) { - if (child->parent) - continue; + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(!state.active); + WARN_ON_ONCE(!start->parent); + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } - tmigr_connect_child_parent(child, group, true); - } + /* Root update */ + if (list_is_singular(&tmigr_level_list[top])) { + group = list_first_entry(&tmigr_level_list[top], + typeof(*group), list); + WARN_ON_ONCE(group->parent); + if (tmigr_root) { + /* Old root should be the same or below */ + WARN_ON_ONCE(tmigr_root->level > top); } + tmigr_root = group; } - +out: kfree(stack); return err; @@ -1748,12 +1887,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) static int tmigr_add_cpu(unsigned int cpu) { + struct tmigr_group *old_root = tmigr_root; int node = cpu_to_node(cpu); int ret; - mutex_lock(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node); - mutex_unlock(&tmigr_mutex); + guard(mutex)(&tmigr_mutex); + + ret = tmigr_setup_groups(cpu, node, NULL, false); + + /* Root has changed? Connect the old one to the new */ + if (ret >= 0 && old_root && old_root != tmigr_root) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + /* + * The (likely) current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); + ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + } return ret; } @@ -1798,6 +1956,11 @@ static int __init tmigr_init(void) if (ncpus == 1) return 0; + if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err; + } + /* * Calculate the required hierarchy levels. Unfortunately there is no * reliable information available, unless all possible CPUs have been @@ -1847,7 +2010,7 @@ static int __init tmigr_init(void) goto err; ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", - tmigr_cpu_online, tmigr_cpu_offline); + tmigr_set_cpu_available, tmigr_clear_cpu_available); if (ret) goto err; diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index ae19f70f8170..70879cde6fdd 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -97,7 +97,7 @@ struct tmigr_group { */ struct tmigr_cpu { raw_spinlock_t lock; - bool online; + bool available; bool idle; bool remote; struct tmigr_group *tmgroup; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d2c79da81e4f..4661b9e606e0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -80,6 +80,12 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE If the architecture generates __patchable_function_entries sections but does not want them included in the ftrace locations. +config HAVE_DYNAMIC_FTRACE_WITH_JMP + bool + help + If the architecture supports to replace the __fentry__ with a + "jmp" instruction. + config HAVE_SYSCALL_TRACEPOINTS bool help @@ -330,6 +336,12 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config DYNAMIC_FTRACE_WITH_JMP + def_bool y + depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS + depends on HAVE_DYNAMIC_FTRACE_WITH_JMP + config FPROBE bool "Kernel Function Probe (fprobe)" depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6941145b5058..af8cbc8e1a7c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -63,13 +63,116 @@ static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); +static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) + +{ + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = lower_32_bits(what); + t->device = dev; + t->error = error; + t->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); +} + +static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, + int pdu_len) +{ + t2->pid = pid; + t2->cpu = cpu; + + t2->sector = sector; + t2->bytes = bytes; + t2->action = what; + t2->device = dev; + t2->error = error; + t2->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace *t; + size_t trace_len = sizeof(*t) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, + int bytes, u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace2 *t; + size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + if (bt->version == 2) + return relay_blktrace_event2(bt, sequence, pid, cpu, sector, + bytes, what, error, cgid, cgid_len, + pdu_data, pdu_len); + return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes, + what, error, cgid, cgid_len, pdu_data, + pdu_len); +} + /* * Send out a notify message. */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, +static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, const void *data, size_t len, u64 cgid) { - struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; @@ -77,38 +180,30 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0)); if (blk_tracer) { + struct blk_io_trace2 *t; + size_t trace_len = sizeof(*t) + cgid_len + len; + buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); - goto record_it; + record_blktrace_event2(t, pid, cpu, 0, 0, + action, bt->dev, 0, cgid, cgid_len, + (void *)data, len); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); - if (t) { - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); -record_it: - t->device = bt->dev; - t->action = action | (cgid ? __BLK_TN_CGROUP : 0); - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len + cgid_len; - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - memcpy((void *) t + sizeof(*t) + cgid_len, data, len); - - if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - } + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid, + cgid_len, (void *)data, len); } /* @@ -182,7 +277,7 @@ void __blk_trace_note_message(struct blk_trace *bt, } EXPORT_SYMBOL_GPL(__blk_trace_note_message); -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, +static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) @@ -213,13 +308,12 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - const blk_opf_t opf, u32 what, int error, + const blk_opf_t opf, u64 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; - struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; @@ -228,6 +322,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; + size_t trace_len; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -238,10 +333,47 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); - if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) + + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: what |= BLK_TC_ACT(BLK_TC_DISCARD); - if (op == REQ_OP_FLUSH) + break; + case REQ_OP_FLUSH: what |= BLK_TC_ACT(BLK_TC_FLUSH); + break; + case REQ_OP_ZONE_APPEND: + what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND); + break; + case REQ_OP_ZONE_RESET: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET); + break; + case REQ_OP_ZONE_RESET_ALL: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL); + break; + case REQ_OP_ZONE_FINISH: + what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH); + break; + case REQ_OP_ZONE_OPEN: + what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE); + break; + case REQ_OP_WRITE_ZEROES: + what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES); + break; + default: + break; + } + + /* Drop trace events for zone operations with blktrace v1 */ + if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) { + pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n", + (unsigned long long)what); + return; + } + if (cgid) what |= __BLK_TA_CGROUP; @@ -255,13 +387,68 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); + switch (bt->version) { + case 1: + trace_len = sizeof(struct blk_io_trace); + break; + case 2: + default: + /* + * ftrace always uses v2 (blk_io_trace2) format. + * + * For sysfs-enabled tracing path (enabled via + * /sys/block/DEV/trace/enable), blk_trace_setup_queue() + * never initializes bt->version, leaving it 0 from + * kzalloc(). We must handle version==0 safely here. + * + * Fall through to default to ensure we never hit the + * old bug where default set trace_len=0, causing + * buffer underflow and memory corruption. + * + * Always use v2 format for ftrace and normalize + * bt->version to 2 when uninitialized. + */ + trace_len = sizeof(struct blk_io_trace2); + if (bt->version == 0) + bt->version = 2; + break; + } + trace_len += pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; - t = ring_buffer_event_data(event); - goto record_it; + + switch (bt->version) { + case 1: + record_blktrace_event(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + case 2: + default: + /* + * Use v2 recording function (record_blktrace_event2) + * which writes blk_io_trace2 structure with correct + * field layout: + * - 32-bit pid at offset 28 + * - 64-bit action at offset 32 + * + * Fall through to default handles version==0 case + * (from sysfs path), ensuring we always use correct + * v2 recording function to match the v2 buffer + * allocated above. + */ + record_blktrace_event2(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + } + + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (unlikely(tsk->btrace_seq != blktrace_seq)) @@ -273,41 +460,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len + cgid_len; - - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - if (pdu_len) - memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); - - if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - return; - } - } - + sequence = per_cpu_ptr(bt->sequence, cpu); + (*sequence)++; + relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, + what, error, cgid, cgid_len, pdu_data, pdu_len); local_irq_restore(flags); } @@ -494,9 +650,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q, + char *name, dev_t dev, + u32 buf_size, u32 buf_nr, + struct block_device *bdev) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -504,31 +661,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, lockdep_assert_held(&q->debugfs_mutex); - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - strreplace(buts->name, '/', '_'); - /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { - pr_warn("Concurrent blktraces are not allowed on %s\n", - buts->name); - return -EBUSY; + pr_warn("Concurrent blktraces are not allowed on %s\n", name); + return ERR_PTR(-EBUSY); } bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); @@ -548,7 +693,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else - bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory @@ -556,8 +701,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * files or directories. */ if (IS_ERR_OR_NULL(dir)) { - pr_warn("debugfs_dir not present for %s so skipping\n", - buts->name); + pr_warn("debugfs_dir not present for %s so skipping\n", name); ret = -ENOENT; goto err; } @@ -569,17 +713,40 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); + bt->rchan = relay_open("trace", dir, buf_size, buf_nr, + &blk_relay_callbacks, bt); if (!bt->rchan) goto err; + blk_trace_setup_lba(bt, bdev); + + return bt; + +err: + blk_trace_free(q, bt); + + return ERR_PTR(ret); +} + +static void blk_trace_setup_finalize(struct request_queue *q, + char *name, int version, + struct blk_trace *bt, + struct blk_user_trace_setup2 *buts) + +{ + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2); + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + strreplace(buts->name, '/', '_'); + + bt->version = version; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; - blk_trace_setup_lba(bt, bdev); - /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; @@ -591,30 +758,43 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); - - ret = 0; -err: - if (ret) - blk_trace_free(q, bt); - return ret; } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { + struct blk_user_trace_setup2 buts2; struct blk_user_trace_setup buts; + struct blk_trace *bt; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; + if (!buts.buf_size || !buts.buf_nr) + return -EINVAL; + + buts2 = (struct blk_user_trace_setup2) { + .act_mask = buts.act_mask, + .buf_size = buts.buf_size, + .buf_nr = buts.buf_nr, + .start_lba = buts.start_lba, + .end_lba = buts.end_lba, + .pid = buts.pid, + }; + mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 1, bt, &buts2); + strcpy(buts.name, buts2.name); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { blk_trace_remove(q); @@ -624,19 +804,54 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } EXPORT_SYMBOL_GPL(blk_trace_setup); +static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) +{ + struct blk_user_trace_setup2 buts2; + struct blk_trace *bt; + + if (copy_from_user(&buts2, arg, sizeof(buts2))) + return -EFAULT; + + if (!buts2.buf_size || !buts2.buf_nr) + return -EINVAL; + + if (buts2.flags != 0) + return -EINVAL; + + mutex_lock(&q->debugfs_mutex); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 2, bt, &buts2); + mutex_unlock(&q->debugfs_mutex); + + if (copy_to_user(arg, &buts2, sizeof(buts2))) { + blk_trace_remove(q); + return -EFAULT; + } + return 0; +} + #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { - struct blk_user_trace_setup buts; + struct blk_user_trace_setup2 buts2; struct compat_blk_user_trace_setup cbuts; - int ret; + struct blk_trace *bt; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; - buts = (struct blk_user_trace_setup) { + if (!cbuts.buf_size || !cbuts.buf_nr) + return -EINVAL; + + buts2 = (struct blk_user_trace_setup2) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, @@ -646,12 +861,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, }; mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 1, bt, &buts2); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; - if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { + if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) { blk_trace_remove(q); return -EFAULT; } @@ -707,6 +926,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) char b[BDEVNAME_SIZE]; switch (cmd) { + case BLKTRACESETUP2: + snprintf(b, sizeof(b), "%pg", bdev); + ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg); + break; case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); @@ -794,7 +1017,7 @@ blk_trace_request_get_cgid(struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, - unsigned int nr_bytes, u32 what, u64 cgid) + unsigned int nr_bytes, u64 what, u64 cgid) { struct blk_trace *bt; @@ -846,6 +1069,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_trace_request_get_cgid(rq)); } +static void blk_add_trace_zone_update_request(void *ignore, struct request *rq) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt) || bt->version < 2) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND, + blk_trace_request_get_cgid(rq)); +} + /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for @@ -858,7 +1097,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u64 what, int error) { struct blk_trace *bt; @@ -924,7 +1163,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); - u32 what; + u64 what; if (explicit) what = BLK_TA_UNPLUG_IO; @@ -936,6 +1175,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } +static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + + return; +} + +static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + return; +} + static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; @@ -1076,6 +1346,15 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); + ret = register_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); + WARN_ON(ret); + ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, + NULL); + WARN_ON(ret); + ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, + NULL); + WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1095,6 +1374,10 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL); + unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL); + unregister_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); @@ -1113,7 +1396,7 @@ static void blk_unregister_tracepoints(void) * struct blk_io_tracer formatting routines */ -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; @@ -1128,7 +1411,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (tc & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE_ZEROES) { + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + } else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; @@ -1148,9 +1434,9 @@ out: } static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent) { - return (const struct blk_io_trace *)ent; + return (const struct blk_io_trace2 *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) @@ -1209,7 +1495,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act, unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); @@ -1223,7 +1509,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { @@ -1444,7 +1730,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t; + const struct blk_io_trace2 *t; u16 what; bool long_act; blk_log_action_t *log_action; @@ -1481,8 +1767,8 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent; + const int offset = offsetof(struct blk_io_trace2, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, @@ -1559,6 +1845,10 @@ static int __init init_blk_tracer(void) return 1; } + BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) % + __alignof__(long)); + BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long)); + return 0; } @@ -1667,6 +1957,7 @@ static const struct { { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, { BLK_TC_FUA, "fua" }, + { BLK_TC_WRITE_ZEROES, "write-zeroes" }, }; static int blk_trace_str2mask(const char *str) @@ -1880,6 +2171,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) rwbs[i++] = 'Z'; rwbs[i++] = 'C'; break; + case REQ_OP_WRITE_ZEROES: + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + break; default: rwbs[i++] = 'N'; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4f87c16d915a..d57727abaade 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2529,7 +2529,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return run_ctx->entry_ip; } -static int +static __always_inline int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) @@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; - u32 chunk_sz, off; + u64 chunk_sz, off; void *dst_slice; int cnt, err; char buf[256]; @@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return -E2BIG; for (off = 0; off < size; off += chunk_sz - 1) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); /* Expect str_copy_fn to return count of copied bytes, including * zero terminator. Next iteration increment off by chunk_sz - 1 to * overwrite NUL. @@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return off; } -static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, - u32 size, const void *unsafe_src, +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, + u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; - u32 off, chunk_sz; + u64 off, chunk_sz; int err; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); @@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 return -E2BIG; for (off = 0; off < size; off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (err) return err; @@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 42bd2ba68a82..bbb37c0f8c6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1971,7 +1971,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, - struct ftrace_hash *new_hash) + struct ftrace_hash *new_hash, + bool update_target) { struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; @@ -2006,10 +2007,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (rec->flags & FTRACE_FL_DISABLED) continue; - /* We need to update only differences of filter_hash */ + /* + * Unless we are updating the target of a direct function, + * we only need to update differences of filter_hash + */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); - if (in_old == in_new) + if (!update_target && (in_old == in_new)) continue; if (in_new) { @@ -2020,7 +2024,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (is_ipmodify) goto rollback; - FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + /* + * If this is called by __modify_ftrace_direct() + * then it is only changing where the direct + * pointer is jumping to, and the record already + * points to a direct trampoline. If it isn't, + * then it is a bug to update ipmodify on a direct + * caller. + */ + FTRACE_WARN_ON(!update_target && + (rec->flags & FTRACE_FL_DIRECT)); /* * Another ops with IPMODIFY is already @@ -2076,7 +2089,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false); } /* Disabling always succeeds */ @@ -2087,7 +2100,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false); } static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, @@ -2101,7 +2114,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, if (ftrace_hash_empty(new_hash)) new_hash = NULL; - return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false); } static void print_ip_ins(const char *fmt, const unsigned char *p) @@ -5938,7 +5951,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { del = __ftrace_lookup_ip(direct_functions, entry->ip); - if (del && del->direct == addr) { + if (del && ftrace_jmp_get(del->direct) == + ftrace_jmp_get(addr)) { remove_hash_entry(direct_functions, del); kfree(del); } @@ -5953,6 +5967,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp) free_ftrace_hash(fhp); } +static void reset_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + remove_direct_functions_hash(hash, addr); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -5992,8 +6017,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (ftrace_hash_empty(hash)) return -EINVAL; + /* This is a "raw" address, and this should never happen. */ + if (WARN_ON_ONCE(ftrace_is_jmp(addr))) + return -EINVAL; + mutex_lock(&direct_mutex); + if (ops->flags & FTRACE_OPS_FL_JMP) + addr = ftrace_jmp_set(addr); + /* Make sure requested entries are not already registered.. */ size = 1 << hash->size_bits; for (i = 0; i < size; i++) { @@ -6048,6 +6080,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) ops->direct_call = addr; err = register_ftrace_function_nolock(ops); + if (err) + reset_direct(ops, addr); out_unlock: mutex_unlock(&direct_mutex); @@ -6080,7 +6114,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, bool free_filters) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; if (check_direct_multi(ops)) @@ -6090,13 +6123,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, mutex_lock(&direct_mutex); err = unregister_ftrace_function(ops); - remove_direct_functions_hash(hash, addr); + reset_direct(ops, addr); mutex_unlock(&direct_mutex); - /* cleanup for possible another register call */ - ops->func = NULL; - ops->trampoline = 0; - if (free_filters) ftrace_free_filter(ops); return err; @@ -6106,7 +6135,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash; + struct ftrace_hash *hash = ops->func_hash->filter_hash; struct ftrace_func_entry *entry, *iter; static struct ftrace_ops tmp_ops = { .func = ftrace_stub, @@ -6117,6 +6146,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) lockdep_assert_held_once(&direct_mutex); + /* This is a "raw" address, and this should never happen. */ + if (WARN_ON_ONCE(ftrace_is_jmp(addr))) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_JMP) + addr = ftrace_jmp_set(addr); + /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; @@ -6127,12 +6163,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, hash, hash, true); + if (err) + goto out; + + /* * Now the ftrace_ops_list_func() is called to do the direct callers. * We can safely change the direct functions attached to each entry. */ mutex_lock(&ftrace_lock); - hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -6147,6 +6191,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) mutex_unlock(&ftrace_lock); +out: /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1244d2c5c384..afcd3747264d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7344,6 +7344,10 @@ consume: goto out; } + /* Did the reader catch up with the writer? */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + goto out; + reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig index 5e16625f1653..0e013f00c33b 100644 --- a/kernel/trace/rv/monitors/pagefault/Kconfig +++ b/kernel/trace/rv/monitors/pagefault/Kconfig @@ -5,6 +5,7 @@ config RV_MON_PAGEFAULT select RV_LTL_MONITOR depends on RV_MON_RTAPP depends on X86 || RISCV + depends on MMU default y select LTL_MON_EVENTS_ID bool "pagefault monitor" diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 48338520376f..43e9ea473cda 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -501,7 +501,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) list_for_each_entry_continue(mon, &rv_monitors_list, list) { if (mon->enabled) - return mon; + return &mon->list; } return NULL; @@ -509,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) { - struct rv_monitor *mon; + struct list_head *head; loff_t l; mutex_lock(&rv_interface_lock); @@ -517,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) if (list_empty(&rv_monitors_list)) return NULL; - mon = list_entry(&rv_monitors_list, struct rv_monitor, list); + head = &rv_monitors_list; for (l = 0; l <= *pos; ) { - mon = enabled_monitors_next(m, mon, &l); - if (!mon) + head = enabled_monitors_next(m, head, &l); + if (!head) break; } - return mon; + return head; } /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1e527cf2aae..304e93597126 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8781,8 +8781,18 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma) put_snapshot_map(iter->tr); } +static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Trace buffer mappings require the complete buffer including + * the meta page. Partial mappings are not supported. + */ + return -EINVAL; +} + static const struct vm_operations_struct tracing_buffers_vmops = { .close = tracing_buffers_mmap_close, + .may_split = tracing_buffers_may_split, }; static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1d536219b624..6bfaf1210dd2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3272,14 +3272,16 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); - kfree(val); + destroy_hist_field(val, 0); ret = PTR_ERR(var); goto err; } field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); if (!field_var) { - kfree(val); + destroy_hist_field(val, 0); + kfree_const(var->type); + kfree(var->var.name); kfree(var); ret = -ENOMEM; goto err; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index c428dafe7496..b15854c75d4f 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = { static int user_event_set_call_visible(struct user_event *user, bool visible) { - int ret; - const struct cred *old_cred; - struct cred *cred; - - cred = prepare_creds(); - + CLASS(prepare_creds, cred)(); if (!cred) return -ENOMEM; @@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) */ cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); - - if (visible) - ret = trace_add_event_call(&user->call); - else - ret = trace_remove_event_call(&user->call); - - revert_creds(old_cred); - put_cred(cred); + scoped_with_creds(cred) { + if (visible) + return trace_add_event_call(&user->call); - return ret; + return trace_remove_event_call(&user->call); + } } static int destroy_user_event(struct user_event *user) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index ad9d6347b5fa..8001dbf16891 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -106,13 +106,14 @@ static struct tracepoint_user *__tracepoint_user_init(const char *name, struct t if (!tuser->name) return NULL; + /* Register tracepoint if it is loaded. */ if (tpoint) { + tuser->tpoint = tpoint; ret = tracepoint_user_register(tuser); if (ret) return ERR_PTR(ret); } - tuser->tpoint = tpoint; tuser->refcount = 1; INIT_LIST_HEAD(&tuser->list); list_add(&tuser->list, &tracepoint_user_list); @@ -1513,6 +1514,10 @@ static int disable_trace_fprobe(struct trace_event_call *call, if (!trace_probe_is_enabled(tp)) { list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { unregister_fprobe(&tf->fp); + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; + } } } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 97db0b0ccf3e..14f86f0a8bc7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1467,12 +1467,12 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ptSp count:%d", field->seqnum, field->duration, field->outer_duration, - (long long)field->timestamp.tv_sec, - field->timestamp.tv_nsec, field->count); + &field->timestamp, + field->count); if (field->nmi_count) { /* diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index dc6040aae3ee..a88fb481c4a3 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu); static inline bool unwind_pending(struct unwind_task_info *info) { - return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); + return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING; } /* @@ -79,6 +79,8 @@ static u64 get_cookie(struct unwind_task_info *info) { u32 cnt = 1; + lockdep_assert_irqs_disabled(); + if (info->id.cpu) return info->id.id; @@ -126,23 +128,20 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) cache = info->cache; trace->entries = cache->entries; - - if (cache->nr_entries) { - /* - * The user stack has already been previously unwound in this - * entry context. Skip the unwind and use the cache. - */ - trace->nr = cache->nr_entries; + trace->nr = cache->nr_entries; + /* + * The user stack has already been previously unwound in this + * entry context. Skip the unwind and use the cache. + */ + if (trace->nr) return 0; - } - trace->nr = 0; unwind_user(trace, UNWIND_MAX_ENTRIES); cache->nr_entries = trace->nr; /* Clear nr_entries on way back to user space */ - set_bit(UNWIND_USED_BIT, &info->unwind_mask); + atomic_long_or(UNWIND_USED, &info->unwind_mask); return 0; } @@ -160,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task) /* Clear pending bit but make sure to have the current bits */ bits = atomic_long_fetch_andnot(UNWIND_PENDING, - (atomic_long_t *)&info->unwind_mask); + &info->unwind_mask); /* * From here on out, the callback must always be called, even if it's * just an empty trace. @@ -231,6 +230,7 @@ void unwind_deferred_task_exit(struct task_struct *task) int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { struct unwind_task_info *info = ¤t->unwind_info; + int twa_mode = TWA_RESUME; unsigned long old, bits; unsigned long bit; int ret; @@ -246,8 +246,11 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * Trigger a warning to make it obvious that an architecture * is using this in NMI when it should not be. */ - if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi())) - return -EINVAL; + if (in_nmi()) { + if (WARN_ON_ONCE(!CAN_USE_IN_NMI)) + return -EINVAL; + twa_mode = TWA_NMI_CURRENT; + } /* Do not allow cancelled works to request again */ bit = READ_ONCE(work->bit); @@ -261,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) *cookie = get_cookie(info); - old = READ_ONCE(info->unwind_mask); + old = atomic_long_read(&info->unwind_mask); /* Is this already queued or executed */ if (old & bit) @@ -274,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * to have a callback. */ bits = UNWIND_PENDING | bit; - old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask); + old = atomic_long_fetch_or(bits, &info->unwind_mask); if (old & bits) { /* * If the work's bit was set, whatever set it had better @@ -285,10 +288,10 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) } /* The work has been claimed, now schedule it. */ - ret = task_work_add(current, &info->work, TWA_RESUME); + ret = task_work_add(current, &info->work, twa_mode); if (WARN_ON_ONCE(ret)) - WRITE_ONCE(info->unwind_mask, 0); + atomic_long_set(&info->unwind_mask, 0); return ret; } @@ -320,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work) guard(rcu)(); /* Clear this bit from all threads */ for_each_process_thread(g, t) { - clear_bit(bit, &t->unwind_info.unwind_mask); + atomic_long_andnot(BIT(bit), + &t->unwind_info.unwind_mask); if (t->unwind_info.cache) clear_bit(bit, &t->unwind_info.cache->unwind_completed); } @@ -350,7 +354,7 @@ void unwind_task_init(struct task_struct *task) memset(info, 0, sizeof(*info)); init_task_work(&info->work, unwind_deferred_task_work); - info->unwind_mask = 0; + atomic_long_set(&info->unwind_mask, 0); } void unwind_task_free(struct task_struct *task) diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 97a8415e3216..39e270789444 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -8,18 +8,28 @@ #include <linux/unwind_user.h> #include <linux/uaccess.h> -static const struct unwind_user_frame fp_frame = { - ARCH_INIT_USER_FP_FRAME -}; - #define for_each_user_frame(state) \ for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) -static int unwind_user_next_fp(struct unwind_user_state *state) +static inline int +get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws) +{ + unsigned long __user *addr = (void __user *)base + off; +#ifdef CONFIG_COMPAT + if (ws == sizeof(int)) { + unsigned int data; + int ret = get_user(data, (unsigned int __user *)addr); + *word = data; + return ret; + } +#endif + return get_user(*word, addr); +} + +static int unwind_user_next_common(struct unwind_user_state *state, + const struct unwind_user_frame *frame) { - const struct unwind_user_frame *frame = &fp_frame; unsigned long cfa, fp, ra; - unsigned int shift; if (frame->use_fp) { if (state->fp < state->sp) @@ -37,24 +47,45 @@ static int unwind_user_next_fp(struct unwind_user_state *state) return -EINVAL; /* Make sure that the address is word aligned */ - shift = sizeof(long) == 4 ? 2 : 3; - if (cfa & ((1 << shift) - 1)) + if (cfa & (state->ws - 1)) return -EINVAL; /* Find the Return Address (RA) */ - if (get_user(ra, (unsigned long *)(cfa + frame->ra_off))) + if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; - if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off))) + if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; state->ip = ra; state->sp = cfa; if (frame->fp_off) state->fp = fp; + state->topmost = false; return 0; } +static int unwind_user_next_fp(struct unwind_user_state *state) +{ +#ifdef CONFIG_HAVE_UNWIND_USER_FP + struct pt_regs *regs = task_pt_regs(current); + + if (state->topmost && unwind_user_at_function_start(regs)) { + const struct unwind_user_frame fp_entry_frame = { + ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_entry_frame); + } + + const struct unwind_user_frame fp_frame = { + ARCH_INIT_USER_FP_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_frame); +#else + return -EINVAL; +#endif +} + static int unwind_user_next(struct unwind_user_state *state) { unsigned long iter_mask = state->available_types; @@ -102,6 +133,12 @@ static int unwind_user_start(struct unwind_user_state *state) state->ip = instruction_pointer(regs); state->sp = user_stack_pointer(regs); state->fp = frame_pointer(regs); + state->ws = unwind_user_word_size(regs); + if (!state->ws) { + state->done = true; + return -EINVAL; + } + state->topmost = true; return 0; } diff --git a/kernel/user.c b/kernel/user.c index 0163665914c9..7aef4e679a6a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc); * and 1 for... ? */ struct user_namespace init_user_ns = { + .ns = NS_COMMON_INIT(init_user_ns), .uid_map = { { .extent[0] = { @@ -65,14 +66,8 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.ns_type = ns_common_type(&init_user_ns), - .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = ns_init_inum(&init_user_ns), -#ifdef CONFIG_USER_NS - .ns.ops = &userns_operations, -#endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 7e45559521af..52f89f1137da 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -119,9 +119,9 @@ static bool post_one_notification(struct watch_queue *wqueue, offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; - p = kmap_atomic(page); + p = kmap_local_page(page); memcpy(p + offset, n, len); - kunmap_atomic(p); + kunmap_local(p); buf = pipe_buf(pipe, head); buf->page = page; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5b62d1002783..873020a2a581 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -196,6 +196,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) #ifdef CONFIG_SYSFS ++hardlockup_count; #endif + /* + * A poorly behaving BPF scheduler can trigger hard lockup by + * e.g. putting numerous affinitized tasks in a single queue and + * directing all CPUs at it. The following call can return true + * only once when sched_ext is enabled and will immediately + * abort the BPF scheduler and print out a warning message. + */ + if (scx_hardlockup(cpu)) + return; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 45320e27a16c..253311af47c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -541,12 +541,6 @@ static void show_one_worker_pool(struct worker_pool *pool); !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") -#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ - !lockdep_is_held(&wq->mutex) && \ - !lockdep_is_held(&wq_pool_mutex), \ - "RCU, wq->mutex or wq_pool_mutex should be held") - #define for_each_bh_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -3443,6 +3437,27 @@ sleep: goto woke_up; } +static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer) +{ + struct worker_pool *pool = pwq->pool; + struct work_struct *work, *n; + + /* need rescue? */ + if (!pwq->nr_active || !need_to_create_worker(pool)) + return false; + + /* + * Slurp in all works issued via this workqueue and + * process'em. + */ + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) + pwq->stats[PWQ_STAT_RESCUED]++; + } + + return !list_empty(&rescuer->scheduled); +} + /** * rescuer_thread - the rescuer thread function * @__rescuer: self @@ -3497,7 +3512,6 @@ repeat: struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; - struct work_struct *work, *n; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -3508,18 +3522,9 @@ repeat: raw_spin_lock_irq(&pool->lock); - /* - * Slurp in all works issued via this workqueue and - * process'em. - */ WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq && - assign_work(work, rescuer, &n)) - pwq->stats[PWQ_STAT_RESCUED]++; - } - if (!list_empty(&rescuer->scheduled)) { + if (assign_rescuer_work(pwq, rescuer)) { process_scheduled_works(rescuer); /* @@ -3534,10 +3539,9 @@ repeat: if (pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); /* - * Queue iff we aren't racing destruction - * and somebody else hasn't queued it already. + * Queue iff somebody else hasn't queued it already. */ - if (wq->rescuer && list_empty(&pwq->mayday_node)) { + if (list_empty(&pwq->mayday_node)) { get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); } @@ -5376,11 +5380,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) /* update node_nr_active->max */ wq_update_node_max_active(ctx->wq, -1); - /* rescuer needs to respect wq cpumask changes */ - if (ctx->wq->rescuer) - set_cpus_allowed_ptr(ctx->wq->rescuer->task, - unbound_effective_cpumask(ctx->wq)); - mutex_unlock(&ctx->wq->mutex); } @@ -5614,10 +5613,13 @@ static int init_rescuer(struct workqueue_struct *wq) } wq->rescuer = rescuer; - if (wq->flags & WQ_UNBOUND) - kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq)); + + /* initial cpumask is consistent with the detached rescuer and unbind_worker() */ + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + kthread_bind_mask(rescuer->task, wq_unbound_cpumask); else kthread_bind_mask(rescuer->task, cpu_possible_mask); + wake_up_process(rescuer->task); return 0; @@ -5902,16 +5904,10 @@ void destroy_workqueue(struct workqueue_struct *wq) /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ if (wq->rescuer) { - struct worker *rescuer = wq->rescuer; - - /* this prevents new queueing */ - raw_spin_lock_irq(&wq_mayday_lock); - wq->rescuer = NULL; - raw_spin_unlock_irq(&wq_mayday_lock); - /* rescuer will empty maydays list before exiting */ - kthread_stop(rescuer->task); - kfree(rescuer); + kthread_stop(wq->rescuer->task); + kfree(wq->rescuer); + wq->rescuer = NULL; } /* @@ -6937,8 +6933,26 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) } if (!ret) { + int cpu; + struct worker_pool *pool; + struct worker *worker; + mutex_lock(&wq_pool_attach_mutex); cpumask_copy(wq_unbound_cpumask, unbound_cpumask); + /* rescuer needs to respect cpumask changes when it is not attached */ + list_for_each_entry(wq, &workqueues, list) { + if (wq->rescuer && !wq->rescuer->pool) + unbind_worker(wq->rescuer); + } + /* DISASSOCIATED worker needs to respect wq_unbound_cpumask */ + for_each_possible_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + if (!(pool->flags & POOL_DISASSOCIATED)) + continue; + for_each_pool_worker(worker, pool) + unbind_worker(worker); + } + } mutex_unlock(&wq_pool_attach_mutex); } return ret; |
