Diffstat (limited to 'include/linux')
111 files changed, 1987 insertions, 602 deletions
diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 45f2f278b50a..70807c679f1a 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -185,6 +185,7 @@ struct atmdev_ops { /* only send is required */ int (*compat_ioctl)(struct atm_dev *dev,unsigned int cmd, void __user *arg); #endif + int (*pre_send)(struct atm_vcc *vcc, struct sk_buff *skb); int (*send)(struct atm_vcc *vcc,struct sk_buff *skb); int (*send_bh)(struct atm_vcc *vcc, struct sk_buff *skb); int (*send_oam)(struct atm_vcc *vcc,void *cell,int flags); diff --git a/include/linux/audit.h b/include/linux/audit.h index a394614ccd0b..536f8ee8da81 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -37,6 +37,8 @@ struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; +struct lsm_id; +struct lsm_prop; struct audit_krule { u32 pflags; @@ -147,6 +149,10 @@ extern unsigned compat_signal_class[]; #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) +/* bit values for audit_cfg_lsm */ +#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) +#define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) + struct filename; #define AUDIT_OFF 0 @@ -185,6 +191,8 @@ extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); +extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); +extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -210,6 +218,8 @@ extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); + #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, @@ -245,6 +255,16 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } +static inline int audit_log_subj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} +static inline int audit_log_obj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -269,6 +289,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } +static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ } + #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC @@ -527,7 +550,7 @@ static inline void audit_log_kern_module(const char *name) static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { - if (!audit_dummy_context()) + if (audit_enabled) __audit_fanotify(response, friar); } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2ad261082bba..c5c9d89c73ed 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -152,6 +152,10 @@ struct bdi_writeback { struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ + struct work_struct switch_work; /* work used to perform inode switching + * to this wb */ + struct llist_head switch_wbs_ctxs; /* queued contexts for + * writeback switching */ union { struct work_struct release_work; diff --git a/include/linux/bitops.h 
b/include/linux/bitops.h index 9be2d50da09a..ea7898cc5903 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -267,7 +267,7 @@ static inline int parity8(u8 val) * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ -static inline unsigned int __ffs64(u64 word) +static inline __attribute_const__ unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 95886b404b16..fe1797bbec42 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -656,6 +656,7 @@ enum { QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ + QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_MAX }; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802..a98c83346134 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -7,6 +7,7 @@ #include <uapi/linux/bpf.h> #include <uapi/linux/filter.h> +#include <crypto/sha2.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/percpu.h> @@ -109,6 +110,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); + int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -206,6 +208,7 @@ enum btf_field_type { BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), + BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { @@ -259,6 +262,7 @@ struct btf_record { int timer_off; int wq_off; int refcount_off; + int task_work_off; struct btf_field fields[]; }; @@ -285,9 +289,11 @@ struct bpf_map_owner { bool xdp_has_frags; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; + enum bpf_attach_type expected_attach_type; }; struct bpf_map { + u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY @@ -328,6 +334,7 @@ struct bpf_map { atomic64_t sleepable_refcnt; s64 __percpu *elem_count; u64 cookie; /* write-once */ + char *excl_prog_sha; }; static inline const char *btf_field_type_name(enum btf_field_type type) @@ -358,6 +365,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; + case BPF_TASK_WORK: + return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; @@ -396,6 +405,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); + case BPF_TASK_WORK: + return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -428,6 +439,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); + case BPF_TASK_WORK: + return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -459,6 +472,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: + case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); @@ -595,6 +609,7 
@@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); +void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -767,12 +782,15 @@ enum bpf_type_flag { */ MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ + DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1110,7 +1128,7 @@ struct bpf_prog_offload { */ #define MAX_BPF_FUNC_REG_ARGS 5 -/* The argument is a structure. */ +/* The argument is a structure or a union. */ #define BTF_FMODEL_STRUCT_ARG BIT(0) /* The argument is signed. */ @@ -1358,6 +1376,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, + /* Points to skb_metadata_end()-skb_metadata_len() */ + BPF_DYNPTR_TYPE_SKB_META, }; int bpf_dynptr_check_size(u32 size); @@ -1619,6 +1639,7 @@ struct bpf_prog_aux { bool priv_stack_requested; bool changes_pkt_data; bool might_sleep; + bool kprobe_write_ctx; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; @@ -1628,6 +1649,7 @@ struct bpf_prog_aux { /* function name for valid attach_btf_id */ const char *attach_func_name; struct bpf_prog **func; + struct bpf_prog_aux *main_prog_aux; void *jit_data; /* JIT specific data. 
arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; @@ -1711,7 +1733,10 @@ struct bpf_prog { enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ - u8 tag[BPF_TAG_SIZE]; + union { + u8 digest[SHA256_DIGEST_SIZE]; + u8 tag[BPF_TAG_SIZE]; + }; struct bpf_prog_stats __percpu *stats; int __percpu *active; unsigned int (*bpf_func)(const void *ctx, @@ -1985,6 +2010,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ @@ -2411,6 +2437,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); @@ -2697,7 +2724,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); @@ -2874,6 +2901,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -3161,6 +3189,11 @@ static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { } + +static inline void bpf_prog_report_arena_violation(bool write, unsigned long addr, + unsigned long fault_ip) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static __always_inline int @@ -3403,6 +3436,38 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ +#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) + +struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); +struct bpf_key *bpf_lookup_system_key(u64 id); +void bpf_key_put(struct bpf_key *bkey); +int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring); + +#else +static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + return NULL; +} + +static inline struct bpf_key *bpf_lookup_system_key(u64 id) +{ + return NULL; +} + +static inline void bpf_key_put(struct bpf_key *bkey) +{ +} + +static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ + return -EOPNOTSUPP; +} +#endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier 
prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 94defa405c85..4c497e839526 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -26,28 +26,6 @@ /* Patch buffer size */ #define INSN_BUF_SIZE 32 -/* Liveness marks, used for registers and spilled-regs (in stack slots). - * Read marks propagate upwards until they find a write mark; they record that - * "one of this state's descendants read this reg" (and therefore the reg is - * relevant for states_equal() checks). - * Write marks collect downwards and do not propagate; they record that "the - * straight-line code that reached this state (from its parent) wrote this reg" - * (and therefore that reads propagated from this state or its descendants - * should not propagate to its parent). - * A state with a write mark can receive read marks; it just won't propagate - * them to its parent, since the write mark is a property, not of the state, - * but of the link between it and its parent. See mark_reg_read() and - * mark_stack_slot_read() in kernel/bpf/verifier.c. - */ -enum bpf_reg_liveness { - REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ - REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ - REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ - REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ -}; - #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { @@ -212,8 +190,6 @@ struct bpf_reg_state { * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; - /* parentage chain for liveness checking */ - struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' @@ -226,7 +202,6 @@ struct bpf_reg_state { * patching which only happens after main verification finished. */ s32 subreg_def; - enum bpf_reg_liveness live; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; @@ -445,6 +420,7 @@ struct bpf_verifier_state { bool speculative; bool in_sleepable; + bool cleaned; /* first and last insn idx of this verifier state */ u32 first_insn_idx; @@ -665,6 +641,7 @@ struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ + u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) @@ -744,6 +721,8 @@ struct bpf_scc_info { struct bpf_scc_visit visits[]; }; +struct bpf_liveness; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -794,7 +773,10 @@ struct bpf_verifier_env { struct { int *insn_state; int *insn_stack; - /* vector of instruction indexes sorted in post-order */ + /* + * vector of instruction indexes sorted in post-order, grouped by subprogram, + * see bpf_subprog_info->postorder_start. 
+ */ int *insn_postorder; int cur_stack; /* current position in the insn_postorder vector */ @@ -842,6 +824,7 @@ struct bpf_verifier_env { struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; + struct bpf_liveness *liveness; /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; @@ -875,13 +858,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, #define verifier_bug_if(cond, env, fmt, args...) \ ({ \ bool __cond = (cond); \ - if (unlikely(__cond)) { \ - BPF_WARN_ONCE(1, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - bpf_log(&env->log, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - } \ + if (unlikely(__cond)) \ + verifier_bug(env, fmt " (" #cond ")", ##args); \ (__cond); \ }) -#define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) +#define verifier_bug(env, fmt, args...) \ + ({ \ + BPF_WARN_ONCE(1, "verifier bug: " fmt "\n", ##args); \ + bpf_log(&env->log, "verifier bug: " fmt "\n", ##args); \ + }) static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { @@ -962,6 +947,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) case BPF_PROG_TYPE_STRUCT_OPS: return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_SYSCALL: return false; default: return true; @@ -1062,4 +1048,21 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno); +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); +int bpf_jmp_offset(struct bpf_insn *insn); +int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); + +int bpf_stack_liveness_init(struct bpf_verifier_env *env); +void bpf_stack_liveness_free(struct bpf_verifier_env *env); +int bpf_update_live_stack(struct bpf_verifier_env *env); +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, u64 mask); +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, u64 mask); +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx); +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env); +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 9eda6b113f9b..f06976ffb63f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -86,7 +86,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used __retain noinline +#define __bpf_kfunc __used __retain __noclone noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 0bf7d33a1048..7fcec025c5e0 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -96,6 +96,14 @@ enum cc_attr { * enabled to run SEV-SNP guests. */ CC_ATTR_HOST_SEV_SNP, + + /** + * @CC_ATTR_SNP_SECURE_AVIC: Secure AVIC mode is active. 
+ * + * The host kernel is running with the necessary features enabled + * to run SEV-SNP guests with full Secure AVIC capabilities. + */ + CC_ATTR_SNP_SECURE_AVIC, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM diff --git a/include/linux/cdx/bitfield.h b/include/linux/cdx/bitfield.h new file mode 100644 index 000000000000..567f8ec47582 --- /dev/null +++ b/include/linux/cdx/bitfield.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2005-2006 Fen Systems Ltd. + * Copyright 2006-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_BITFIELD_H +#define CDX_BITFIELD_H + +#include <linux/bitfield.h> + +/* Lowest bit numbers and widths */ +#define CDX_DWORD_LBN 0 +#define CDX_DWORD_WIDTH 32 + +/* Specified attribute (e.g. LBN) of the specified field */ +#define CDX_VAL(field, attribute) field ## _ ## attribute +/* Low bit number of the specified field */ +#define CDX_LOW_BIT(field) CDX_VAL(field, LBN) +/* Bit width of the specified field */ +#define CDX_WIDTH(field) CDX_VAL(field, WIDTH) +/* High bit number of the specified field */ +#define CDX_HIGH_BIT(field) (CDX_LOW_BIT(field) + CDX_WIDTH(field) - 1) + +/* A doubleword (i.e. 4 byte) datatype - little-endian in HW */ +struct cdx_dword { + __le32 cdx_u32; +}; + +/* Value expanders for printk */ +#define CDX_DWORD_VAL(dword) \ + ((unsigned int)le32_to_cpu((dword).cdx_u32)) + +/* + * Extract bit field portion [low,high) from the 32-bit little-endian + * element which contains bits [min,max) + */ +#define CDX_DWORD_FIELD(dword, field) \ + (FIELD_GET(GENMASK(CDX_HIGH_BIT(field), CDX_LOW_BIT(field)), \ + le32_to_cpu((dword).cdx_u32))) + +/* + * Creates the portion of the named bit field that lies within the + * range [min,max). + */ +#define CDX_INSERT_FIELD(field, value) \ + (FIELD_PREP(GENMASK(CDX_HIGH_BIT(field), \ + CDX_LOW_BIT(field)), value)) + +/* + * Creates the portion of the named bit fields that lie within the + * range [min,max). + */ +#define CDX_INSERT_FIELDS(field1, value1, \ + field2, value2, \ + field3, value3, \ + field4, value4, \ + field5, value5, \ + field6, value6, \ + field7, value7) \ + (CDX_INSERT_FIELD(field1, (value1)) | \ + CDX_INSERT_FIELD(field2, (value2)) | \ + CDX_INSERT_FIELD(field3, (value3)) | \ + CDX_INSERT_FIELD(field4, (value4)) | \ + CDX_INSERT_FIELD(field5, (value5)) | \ + CDX_INSERT_FIELD(field6, (value6)) | \ + CDX_INSERT_FIELD(field7, (value7))) + +#define CDX_POPULATE_DWORD(dword, ...) \ + (dword).cdx_u32 = cpu_to_le32(CDX_INSERT_FIELDS(__VA_ARGS__)) + +/* Populate a dword field with various numbers of arguments */ +#define CDX_POPULATE_DWORD_7 CDX_POPULATE_DWORD +#define CDX_POPULATE_DWORD_6(dword, ...) \ + CDX_POPULATE_DWORD_7(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_5(dword, ...) \ + CDX_POPULATE_DWORD_6(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_4(dword, ...) \ + CDX_POPULATE_DWORD_5(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_3(dword, ...) \ + CDX_POPULATE_DWORD_4(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_2(dword, ...) \ + CDX_POPULATE_DWORD_3(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_1(dword, ...) 
\ + CDX_POPULATE_DWORD_2(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_SET_DWORD(dword) \ + CDX_POPULATE_DWORD_1(dword, CDX_DWORD, 0xffffffff) + +#endif /* CDX_BITFIELD_H */ diff --git a/include/linux/cdx/edac_cdx_pcol.h b/include/linux/cdx/edac_cdx_pcol.h new file mode 100644 index 000000000000..749db33bb482 --- /dev/null +++ b/include/linux/cdx/edac_cdx_pcol.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Driver for AMD network controllers and boards + * + * Copyright (C) 2021, Xilinx, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef MC_CDX_PCOL_H +#define MC_CDX_PCOL_H +#include <linux/cdx/mcdi.h> + +#define MC_CMD_EDAC_GET_DDR_CONFIG_OUT_WORD_LENGTH_LEN 4 +/* Number of registers for the DDR controller */ +#define MC_CMD_GET_DDR_CONFIG_OFST 4 +#define MC_CMD_GET_DDR_CONFIG_LEN 4 + +/***********************************/ +/* MC_CMD_EDAC_GET_DDR_CONFIG + * Provides detailed configuration for the DDR controller of the given index. + */ +#define MC_CMD_EDAC_GET_DDR_CONFIG 0x3 + +/* MC_CMD_EDAC_GET_DDR_CONFIG_IN msgrequest */ +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_OFST 0 +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_LEN 4 + +#endif /* MC_CDX_PCOL_H */ diff --git a/include/linux/cdx/mcdi.h b/include/linux/cdx/mcdi.h new file mode 100644 index 000000000000..74075305cba4 --- /dev/null +++ b/include/linux/cdx/mcdi.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2008-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_MCDI_H +#define CDX_MCDI_H + +#include <linux/mutex.h> +#include <linux/kref.h> +#include <linux/rpmsg.h> + +#include "linux/cdx/bitfield.h" + +/** + * enum cdx_mcdi_mode - MCDI transaction mode + * @MCDI_MODE_EVENTS: wait for an mcdi response callback. + * @MCDI_MODE_FAIL: we think MCDI is dead, so fail-fast all calls + */ +enum cdx_mcdi_mode { + MCDI_MODE_EVENTS, + MCDI_MODE_FAIL, +}; + +#define MCDI_RPC_TIMEOUT (10 * HZ) +#define MCDI_RPC_LONG_TIMEOUT (60 * HZ) +#define MCDI_RPC_POST_RST_TIME (10 * HZ) + +/** + * enum cdx_mcdi_cmd_state - State for an individual MCDI command + * @MCDI_STATE_QUEUED: Command not started and is waiting to run. + * @MCDI_STATE_RETRY: Command was submitted and MC rejected with no resources, + * as MC have too many outstanding commands. Command will be retried once + * another command returns. + * @MCDI_STATE_RUNNING: Command was accepted and is running. + * @MCDI_STATE_RUNNING_CANCELLED: Command is running but the issuer cancelled + * the command. + * @MCDI_STATE_FINISHED: Processing of this command has completed. + */ + +enum cdx_mcdi_cmd_state { + MCDI_STATE_QUEUED, + MCDI_STATE_RETRY, + MCDI_STATE_RUNNING, + MCDI_STATE_RUNNING_CANCELLED, + MCDI_STATE_FINISHED, +}; + +/** + * struct cdx_mcdi - CDX MCDI Firmware interface, to interact + * with CDX controller.
+ * @mcdi: MCDI interface + * @mcdi_ops: MCDI operations + * @r5_rproc : R5 Remoteproc device handle + * @rpdev: RPMsg device + * @ept: RPMsg endpoint + * @work: Post probe work + */ +struct cdx_mcdi { + /* MCDI interface */ + struct cdx_mcdi_data *mcdi; + const struct cdx_mcdi_ops *mcdi_ops; + + struct rproc *r5_rproc; + struct rpmsg_device *rpdev; + struct rpmsg_endpoint *ept; + struct work_struct work; +}; + +struct cdx_mcdi_ops { + void (*mcdi_request)(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len); + unsigned int (*mcdi_rpc_timeout)(struct cdx_mcdi *cdx, unsigned int cmd); +}; + +typedef void cdx_mcdi_async_completer(struct cdx_mcdi *cdx, + unsigned long cookie, int rc, + struct cdx_dword *outbuf, + size_t outlen_actual); + +/** + * struct cdx_mcdi_cmd - An outstanding MCDI command + * @ref: Reference count. There will be one reference if the command is + * in the mcdi_iface cmd_list, another if it's on a cleanup list, + * and a third if it's queued in the work queue. + * @list: The data for this entry in mcdi->cmd_list + * @cleanup_list: The data for this entry in a cleanup list + * @work: The work item for this command, queued in mcdi->workqueue + * @mcdi: The mcdi_iface for this command + * @state: The state of this command + * @inlen: inbuf length + * @inbuf: Input buffer + * @quiet: Whether to silence errors + * @reboot_seen: Whether a reboot has been seen during this command, + * to prevent duplicates + * @seq: Sequence number + * @started: Jiffies this command was started at + * @cookie: Context for completion function + * @completer: Completion function + * @handle: Command handle + * @cmd: Command number + * @rc: Return code + * @outlen: Length of output buffer + * @outbuf: Output buffer + */ +struct cdx_mcdi_cmd { + struct kref ref; + struct list_head list; + struct list_head cleanup_list; + struct work_struct work; + struct cdx_mcdi_iface *mcdi; + enum cdx_mcdi_cmd_state state; + size_t inlen; + const struct cdx_dword *inbuf; + bool quiet; + bool reboot_seen; + u8 seq; + unsigned long started; + unsigned long cookie; + cdx_mcdi_async_completer *completer; + unsigned int handle; + unsigned int cmd; + int rc; + size_t outlen; + struct cdx_dword *outbuf; + /* followed by inbuf data if necessary */ +}; + +/** + * struct cdx_mcdi_iface - MCDI protocol context + * @cdx: The associated NIC + * @iface_lock: Serialise access to this structure + * @outstanding_cleanups: Count of cleanups + * @cmd_list: List of outstanding and running commands + * @workqueue: Workqueue used for delayed processing + * @cmd_complete_wq: Waitqueue for command completion + * @db_held_by: Command the MC doorbell is in use by + * @seq_held_by: Command each sequence number is in use by + * @prev_handle: The last used command handle + * @mode: Poll for mcdi completion, or wait for an mcdi_event + * @prev_seq: The last used sequence number + * @new_epoch: Indicates start of day or start of MC reboot recovery + */ +struct cdx_mcdi_iface { + struct cdx_mcdi *cdx; + /* Serialise access */ + struct mutex iface_lock; + unsigned int outstanding_cleanups; + struct list_head cmd_list; + struct workqueue_struct *workqueue; + wait_queue_head_t cmd_complete_wq; + struct cdx_mcdi_cmd *db_held_by; + struct cdx_mcdi_cmd *seq_held_by[16]; + unsigned int prev_handle; + enum cdx_mcdi_mode mode; + u8 prev_seq; + bool new_epoch; +}; + +/** + * struct cdx_mcdi_data - extra state for NICs that implement MCDI + * @iface: Interface/protocol state + * @fn_flags: 
Flags for this function, as returned by %MC_CMD_DRV_ATTACH. + */ +struct cdx_mcdi_data { + struct cdx_mcdi_iface iface; + u32 fn_flags; +}; + +void cdx_mcdi_finish(struct cdx_mcdi *cdx); +int cdx_mcdi_init(struct cdx_mcdi *cdx); +void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); +int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); + +/* + * We expect that 16- and 32-bit fields in MCDI requests and responses + * are appropriately aligned, but 64-bit fields are only + * 32-bit-aligned. + */ +#define MCDI_DECLARE_BUF(_name, _len) struct cdx_dword _name[DIV_ROUND_UP(_len, 4)] = {{0}} +#define _MCDI_PTR(_buf, _offset) \ + ((u8 *)(_buf) + (_offset)) +#define MCDI_PTR(_buf, _field) \ + _MCDI_PTR(_buf, MC_CMD_ ## _field ## _OFST) +#define _MCDI_CHECK_ALIGN(_ofst, _align) \ + ((void)BUILD_BUG_ON_ZERO((_ofst) & ((_align) - 1)), \ + (_ofst)) +#define _MCDI_DWORD(_buf, _field) \ + ((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2)) + +#define MCDI_SET_DWORD(_buf, _field, _value) \ + CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value) +#define MCDI_DWORD(_buf, _field) \ + CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD) +#endif /* CDX_MCDI_H */ diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 52a98886a455..1fd22ea6eba4 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,7 +11,7 @@ #include <linux/module.h> #include <asm/cfi.h> -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI extern bool cfi_warn; enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, @@ -52,7 +52,7 @@ static inline u32 cfi_get_func_hash(void *func) extern u32 cfi_bpf_hash; extern u32 cfi_bpf_subprog_hash; -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ static inline int cfi_get_offset(void) { return 0; } static inline u32 cfi_get_func_hash(void *func) { return 0; } @@ -60,7 +60,7 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; } #define cfi_bpf_hash 0U #define cfi_bpf_subprog_hash 0U -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifdef CONFIG_ARCH_USES_CFI_TRAPS bool is_cfi_trap(unsigned long addr); diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h index 685f7181780f..a86af9bc8bdc 100644 --- a/include/linux/cfi_types.h +++ b/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include <linux/linkage.h> -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_<function> type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ @@ -43,7 +43,7 @@ #else /* __ASSEMBLY__ */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI #define DEFINE_CFI_TYPE(name, func) \ /* \ * Force a reference to the function so the compiler generates \ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6b93a64115fe..93318fce31f3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -91,6 +91,12 @@ enum { * cgroup_threadgroup_rwsem. 
This makes hot path operations such as * forks and exits into the slow path and more expensive. * + * Alleviate the contention between fork, exec, exit operations and + * writing to cgroup.procs by taking a per threadgroup rwsem instead of + * the global cgroup_threadgroup_rwsem. Fork and other operations + * from threads in different thread groups no longer contend with + * writing to cgroup.procs. + * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from @@ -140,6 +146,17 @@ enum { __CFTYPE_ADDED = (1 << 18), }; +enum cgroup_attach_lock_mode { + /* Default */ + CGRP_ATTACH_LOCK_GLOBAL, + + /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ + CGRP_ATTACH_LOCK_NONE, + + /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */ + CGRP_ATTACH_LOCK_PER_THREADGROUP, +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can @@ -433,6 +450,23 @@ struct cgroup_freezer_state { * frozen, SIGSTOPped, and PTRACEd. */ int nr_frozen_tasks; + + /* Freeze time data consistency protection */ + seqcount_t freeze_seq; + + /* + * Most recent time the cgroup was requested to freeze. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 freeze_start_nsec; + + /* + * Total duration the cgroup has spent freezing. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 frozen_nsec; }; struct cgroup { @@ -746,7 +780,6 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); - void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); @@ -822,6 +855,7 @@ struct cgroup_subsys { }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +extern bool cgroup_enable_per_threadgroup_rwsem; struct cgroup_of_peak { unsigned long value; @@ -833,11 +867,14 @@ struct cgroup_of_peak { * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes - * using a percpu_rw_semaphore. + * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when + * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition. 
*/ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); + if (cgroup_enable_per_threadgroup_rwsem) + down_read(&tsk->signal->cgroup_threadgroup_rwsem); } /** @@ -848,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { + if (cgroup_enable_per_threadgroup_rwsem) + up_read(&tsk->signal->cgroup_threadgroup_rwsem); percpu_up_read(&cgroup_threadgroup_rwsem); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..6ed477338b16 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,7 @@ #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> +#include <linux/cgroup_namespace.h> struct kernel_clone_args; @@ -354,6 +355,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_online(struct cgroup_subsys_state *css) +{ + return css->flags & CSS_ONLINE; +} + static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { @@ -650,6 +656,7 @@ static inline void cgroup_kthread_ready(void) } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); +struct cgroup *__cgroup_get_from_id(u64 id); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ @@ -783,52 +790,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ -struct cgroup_namespace { - struct ns_common ns; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct css_set *root_cset; -}; - -extern struct cgroup_namespace init_cgroup_ns; - -#ifdef CONFIG_CGROUPS - -void free_cgroup_ns(struct cgroup_namespace *ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns); - -int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -static inline void put_cgroup_ns(struct cgroup_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_cgroup_ns(ns); -} - -#else /* !CONFIG_CGROUPS */ - -static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } -static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - return old_ns; -} - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } -static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } - -#endif /* !CONFIG_CGROUPS */ - #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h new file mode 100644 index 000000000000..78a8418558a4 --- /dev/null +++ b/include/linux/cgroup_namespace.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CGROUP_NAMESPACE_H +#define _LINUX_CGROUP_NAMESPACE_H + +#include <linux/ns_common.h> + +struct cgroup_namespace { + struct ns_common ns; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct css_set *root_cset; +}; + +extern struct cgroup_namespace init_cgroup_ns; + +#ifdef CONFIG_CGROUPS + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +void free_cgroup_ns(struct cgroup_namespace *ns); + +struct 
cgroup_namespace *copy_cgroup_ns(u64 flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns); + +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + ns_ref_inc(ns); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (ns_ref_put(ns)) + free_cgroup_ns(ns); +} + +#else /* !CONFIG_CGROUPS */ + +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } +static inline struct cgroup_namespace * +copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + return old_ns; +} + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } +static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } + +#endif /* !CONFIG_CGROUPS */ + +#endif /* _LINUX_CGROUP_NAMESPACE_H */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index fa4ffe037bc7..8720a0705900 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -18,23 +18,42 @@ #define KASAN_ABI_VERSION 5 /* + * Clang 22 added preprocessor macros to match GCC, in hopes of eventually + * dropping __has_feature support for sanitizers: + * https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c + * Create these macros for older versions of clang so that it is easy to clean + * up once the minimum supported version of LLVM for building the kernel always + * creates these macros. + * * Note: Checking __has_feature(*_sanitizer) is only true if the feature is * enabled. Therefore it is not required to additionally check defined(CONFIG_*) * to avoid adding redundant attributes in other configurations. */ +#if __has_feature(address_sanitizer) && !defined(__SANITIZE_ADDRESS__) +#define __SANITIZE_ADDRESS__ +#endif +#if __has_feature(hwaddress_sanitizer) && !defined(__SANITIZE_HWADDRESS__) +#define __SANITIZE_HWADDRESS__ +#endif +#if __has_feature(thread_sanitizer) && !defined(__SANITIZE_THREAD__) +#define __SANITIZE_THREAD__ +#endif -#if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer) -/* Emulate GCC's __SANITIZE_ADDRESS__ flag */ +/* + * Treat __SANITIZE_HWADDRESS__ the same as __SANITIZE_ADDRESS__ in the kernel. + */ +#ifdef __SANITIZE_HWADDRESS__ #define __SANITIZE_ADDRESS__ +#endif + +#ifdef __SANITIZE_ADDRESS__ #define __no_sanitize_address \ __attribute__((no_sanitize("address", "hwaddress"))) #else #define __no_sanitize_address #endif -#if __has_feature(thread_sanitizer) -/* emulate gcc's __SANITIZE_THREAD__ flag */ -#define __SANITIZE_THREAD__ +#ifdef __SANITIZE_THREAD__ #define __no_sanitize_thread \ __attribute__((no_sanitize("thread"))) #else diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6f04a1d8c720..5b45ea7dff3e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -248,7 +248,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ -#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_CFI) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) /* * Force a reference to the external symbol so the compiler generates * __kcfi_typid. 
@@ -288,14 +288,6 @@ static inline void *offset_to_ptr(const int *off) #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -#define __ADDRESSABLE_ASM(sym) \ - .pushsection .discard.addressable,"aw"; \ - .align ARCH_SEL(8,4); \ - ARCH_SEL(.quad, .long) __stringify(sym); \ - .popsection; - -#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) - /* * This returns a constant expression while determining if an argument is * a constant expression, most importantly without evaluating the argument. diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 16755431fc11..2f3e80bf9f35 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -330,6 +330,29 @@ struct ftrace_likely_data { #endif /* + * The assume attribute is used to indicate that a certain condition is + * assumed to be true. If this condition is violated at runtime, the behavior + * is undefined. Compilers may or may not use this indication to generate + * optimized code. + * + * Note that the clang documentation states that optimizers may react + * differently to this attribute, and this may even have a negative + * performance impact. Therefore this attribute should be used with care. + * + * Optional: only supported since gcc >= 13 + * Optional: only supported since clang >= 19 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html#index-assume-statement-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#id13 + * + */ +#ifdef CONFIG_CC_HAS_ASSUME +# define __assume(expr) __attribute__((__assume__(expr))) +#else +# define __assume(expr) +#endif + +/* * Optional: only supported since gcc >= 15 * Optional: only supported since clang >= 18 * diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b91b993f58ee..487b3bf2e1ea 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -83,6 +83,7 @@ extern ssize_t cpu_show_old_microcode(struct device *dev, extern ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index edfa61d80702..62cd7b35a29c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -168,6 +168,7 @@ enum cpuhp_state { CPUHP_AP_QCOM_TIMER_STARTING, CPUHP_AP_TEGRA_TIMER_STARTING, CPUHP_AP_ARMADA_TIMER_STARTING, + CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_REALTEK_TIMER_STARTING, diff --git a/include/linux/cred.h b/include/linux/cred.h index a102a10f833f..89ae50ad2ace 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -148,7 +148,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); -extern int copy_creds(struct task_struct *, unsigned long); +extern int copy_creds(struct task_struct *, u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddd..9e62b2a85538 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -636,6 +636,7 @@ struct 
damon_operations { * @data: Data that will be passed to @fn. * @repeat: Repeat invocations. * @return_code: Return code from @fn invocation. + * @dealloc_on_cancel: De-allocate when canceled. * * Control damon_call(), which requests specific kdamond to invoke a given * function. Refer to damon_call() for more details. @@ -645,6 +646,7 @@ struct damon_call_control { void *data; bool repeat; int return_code; + bool dealloc_on_cancel; /* private: internal use only */ /* informs if the kdamond finished handling of the request */ struct completion completion; diff --git a/include/linux/dlm.h b/include/linux/dlm.h index bacda9898f2b..7e7b45b0d097 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -88,12 +88,43 @@ int dlm_new_lockspace(const char *name, const char *cluster, int *ops_result, dlm_lockspace_t **lockspace); /* + * dlm_release_lockspace() release_option values: + * + * DLM_RELEASE_NO_LOCKS returns -EBUSY if any locks (lkb's) + * exist in the local lockspace. + * + * DLM_RELEASE_UNUSED previous value that is no longer used. + * + * DLM_RELEASE_NORMAL releases the lockspace regardless of any + * locks managed in the local lockspace. + * + * DLM_RELEASE_NO_EVENT release the lockspace regardless of any + * locks managed in the local lockspace, and does not submit + * a leave event to the cluster manager, so other nodes will + * not be notified that the node should be removed from the + * list of lockspace members. + * + * DLM_RELEASE_RECOVER like DLM_RELEASE_NORMAL, but the remaining + * nodes will handle the removal of the node as if the node + * had failed, e.g. the recover_slot() callback would be used. + */ +#define DLM_RELEASE_NO_LOCKS 0 +#define DLM_RELEASE_UNUSED 1 +#define DLM_RELEASE_NORMAL 2 +#define DLM_RELEASE_NO_EVENT 3 +#define DLM_RELEASE_RECOVER 4 +#define __DLM_RELEASE_MAX DLM_RELEASE_RECOVER + +/* * dlm_release_lockspace * * Stop a lockspace. + * + * release_option: see DLM_RELEASE values above. 
*/ -int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); +int dlm_release_lockspace(dlm_lockspace_t *lockspace, + unsigned int release_option); /* * dlm_lock diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index f48e5fb88bd5..332b80c42b6f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -153,6 +153,9 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page, { __free_pages(page, get_order(size)); } +static inline void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) +{ +} #endif /* CONFIG_DMA_CMA*/ #ifdef CONFIG_DMA_DECLARE_COHERENT diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 7fa1eb3cc823..61d50571ad88 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -171,6 +171,9 @@ int em_dev_update_perf_domain(struct device *dev, int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts); +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); struct em_perf_table *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table *table); @@ -350,6 +353,13 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, { return -EINVAL; } +static inline +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ + return -EINVAL; +} static inline void em_dev_unregister_perf_domain(struct device *dev) { } diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index de5bd76a400c..d7d757e72554 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -856,8 +856,8 @@ struct kernel_ethtool_ts_info { enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_source phc_source; int phc_phyindex; - enum hwtstamp_tx_types tx_types; - enum hwtstamp_rx_filters rx_filters; + u32 tx_types; + u32 rx_filters; }; /** diff --git a/include/linux/export.h b/include/linux/export.h index f35d03b4113b..a686fd0ba406 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -91,6 +91,6 @@ #define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", ns) #define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "GPL", ns) -#define EXPORT_SYMBOL_GPL_FOR_MODULES(sym, mods) __EXPORT_SYMBOL(sym, "GPL", "module:" mods) +#define EXPORT_SYMBOL_FOR_MODULES(sym, mods) __EXPORT_SYMBOL(sym, "GPL", "module:" mods) #endif /* _LINUX_EXPORT_H */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c..3aac58a520c7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -123,6 +123,12 @@ enum fid_type { FILEID_BCACHEFS_WITH_PARENT = 0xb2, /* + * + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. + */ + FILEID_NSFS = 0xf1, + + /* * 64 bit unique kernfs id */ FILEID_KERNFS = 0xfe, diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e7fd3ee759e..f5c859b8131a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -78,6 +78,9 @@ struct ctl_table_header; /* unused opcode to mark special atomic instruction */ #define BPF_PROBE_ATOMIC 0xe0 +/* unused opcode to mark special ldsx instruction. 
Same as BPF_NOSPEC */ +#define BPF_PROBE_MEM32SX 0xc0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 @@ -997,12 +1000,6 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) return prog->len * sizeof(struct bpf_insn); } -static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) -{ - return round_up(bpf_prog_insn_size(prog) + - sizeof(__be64) + 1, SHA1_BLOCK_SIZE); -} - static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), @@ -1296,7 +1293,7 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { - pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, + pr_err("flen=%u proglen=%u pass=%u image=%p from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) @@ -1784,6 +1781,7 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1818,6 +1816,11 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi unsigned long len, bool flush) { } + +static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ diff --git a/include/linux/firewire.h b/include/linux/firewire.h index cceb70415ed2..d38c6e538e5c 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -341,7 +341,11 @@ struct fw_address_handler { u64 length; fw_address_callback_t address_callback; void *callback_data; + + // Only for core functions. 
struct list_head link; + struct kref kref; + struct completion done; }; struct fw_address_region { diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index d4212bc42b2c..a33b45027356 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -26,13 +26,43 @@ #define SCMI_IMX94_CTRL_SAI3_MCLK 5U /*!< WAKE SAI3 MCLK */ #define SCMI_IMX94_CTRL_SAI4_MCLK 6U /*!< WAKE SAI4 MCLK */ +#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_DRV) int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); +#else +static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) +{ + return -EOPNOTSUPP; +} +static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val) +{ + return -EOPNOTSUPP; +} +#endif + +#if IS_ENABLED(CONFIG_IMX_SCMI_CPU_DRV) int scmi_imx_cpu_start(u32 cpuid, bool start); int scmi_imx_cpu_started(u32 cpuid, bool *started); int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, bool boot, bool resume); +#else +static inline int scmi_imx_cpu_start(u32 cpuid, bool start) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_started(u32 cpuid, bool *started) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, + bool boot, bool resume) +{ + return -EOPNOTSUPP; +} +#endif enum scmi_imx_lmm_op { SCMI_IMX_LMM_BOOT, @@ -44,7 +74,24 @@ enum scmi_imx_lmm_op { #define SCMI_IMX_LMM_OP_FORCEFUL 0 #define SCMI_IMX_LMM_OP_GRACEFUL BIT(0) +#if IS_ENABLED(CONFIG_IMX_SCMI_LMM_DRV) int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags); int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info); int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector); +#else +static inline int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector) +{ + return -EOPNOTSUPP; +} +#endif #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..9e9d7c757efe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -72,9 +72,7 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; -struct fscrypt_inode_info; struct fscrypt_operations; -struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; @@ -149,7 +147,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) -/* FMODE_* bit 13 */ +/* Supports IOCB_HAS_METADATA */ +#define FMODE_HAS_METADATA ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) @@ -356,6 +355,7 @@ struct readahead_control; #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE +#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -667,6 +667,124 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_CACHED_LINK 0x0040 /* + * Inode state bits. Protected by inode->i_lock + * + * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, + * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. 
+ * + * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, + * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at + * various stages of removing an inode. + * + * Two bits are used for locking and completion notification, I_NEW and I_SYNC. + * + * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on + * fdatasync() (unless I_DIRTY_DATASYNC is also set). + * Timestamp updates are the usual cause. + * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of + * these changes separately from I_DIRTY_SYNC so that we + * don't have to write inode on fdatasync() when only + * e.g. the timestamps have changed. + * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. + * I_DIRTY_TIME The inode itself has dirty timestamps, and the + * lazytime mount option is enabled. We keep track of this + * separately from I_DIRTY_SYNC in order to implement + * lazytime. This gets cleared if I_DIRTY_INODE + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But + * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already + * in place because writeback might already be in progress + * and we don't want to lose the time update + * I_NEW Serves as both a mutex and completion notification. + * New inodes set I_NEW. If two processes both create + * the same inode, one of them will release its inode and + * wait for I_NEW to be released before returning. + * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can + * also cause waiting on I_NEW, without I_NEW actually + * being set. find_inode() uses this to prevent returning + * nearly-dead inodes. + * I_WILL_FREE Must be set when calling write_inode_now() if i_count + * is zero. I_FREEING must be set when I_WILL_FREE is + * cleared. + * I_FREEING Set when inode is about to be freed but still has dirty + * pages or buffers attached or the inode itself is still + * dirty. + * I_CLEAR Added by clear_inode(). In this state the inode is + * clean and can be destroyed. Inode keeps I_FREEING. + * + * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are + * prohibited for many purposes. iget() must wait for + * the inode to be completely released, then create it + * anew. Other functions will just ignore such inodes, + * if appropriate. I_NEW is used for waiting. + * + * I_SYNC Writeback of inode is running. The bit is set during + * data writeback, and cleared with a wakeup on the bit + * address once it is done. The bit is also used to pin + * the inode in memory for flusher thread. + * + * I_REFERENCED Marks the inode as recently references on the LRU list. + * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab the i_pages lock. See + * inode_switch_wbs_work_fn() for details. + * + * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper + * and work dirs among overlayfs mounts. + * + * I_CREATING New object's inode in the middle of setting up. + * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * + * Q: What is the difference between I_WILL_FREE and I_FREEING? 
+ * + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait + * upon. There's one free address left. + */ + +enum inode_state_bits { + __I_NEW = 0U, + __I_SYNC = 1U, + __I_LRU_ISOLATING = 2U + /* reserved wait address bit 3 */ +}; + +enum inode_state_flags_t { + I_NEW = (1U << __I_NEW), + I_SYNC = (1U << __I_SYNC), + I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), + /* reserved flag bit 3 */ + I_DIRTY_SYNC = (1U << 4), + I_DIRTY_DATASYNC = (1U << 5), + I_DIRTY_PAGES = (1U << 6), + I_WILL_FREE = (1U << 7), + I_FREEING = (1U << 8), + I_CLEAR = (1U << 9), + I_REFERENCED = (1U << 10), + I_LINKABLE = (1U << 11), + I_DIRTY_TIME = (1U << 12), + I_WB_SWITCH = (1U << 13), + I_OVL_INUSE = (1U << 14), + I_CREATING = (1U << 15), + I_DONTCACHE = (1U << 16), + I_SYNC_QUEUED = (1U << 17), + I_PINNING_NETFS_WB = (1U << 18) +}; + +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) +#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) + +/* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' @@ -724,7 +842,7 @@ struct inode { #endif /* Misc */ - u32 i_state; + enum inode_state_flags_t i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -780,14 +898,6 @@ struct inode { struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif -#ifdef CONFIG_FS_ENCRYPTION - struct fscrypt_inode_info *i_crypt_info; -#endif - -#ifdef CONFIG_FS_VERITY - struct fsverity_info *i_verity_info; -#endif - void *i_private; /* fs or device private pointer */ } __randomize_layout; @@ -2008,20 +2118,18 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, /** * struct renamedata - contains all information required for renaming - * @old_mnt_idmap: idmap of the old mount the inode was found from + * @mnt_idmap: idmap of the mount in which the rename is happening. * @old_parent: parent of source * @old_dentry: source - * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_parent: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { - struct mnt_idmap *old_mnt_idmap; + struct mnt_idmap *mnt_idmap; struct dentry *old_parent; struct dentry *old_dentry; - struct mnt_idmap *new_mnt_idmap; struct dentry *new_parent; struct dentry *new_dentry; struct inode **delegated_inode; @@ -2052,8 +2160,6 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2492,117 +2598,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, }; } -/* - * Inode state bits. Protected by inode->i_lock - * - * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, - * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. - * - * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, - * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at - * various stages of removing an inode. - * - * Two bits are used for locking and completion notification, I_NEW and I_SYNC. - * - * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on - * fdatasync() (unless I_DIRTY_DATASYNC is also set). 
- * Timestamp updates are the usual cause. - * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of - * these changes separately from I_DIRTY_SYNC so that we - * don't have to write inode on fdatasync() when only - * e.g. the timestamps have changed. - * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_DIRTY_TIME The inode itself has dirty timestamps, and the - * lazytime mount option is enabled. We keep track of this - * separately from I_DIRTY_SYNC in order to implement - * lazytime. This gets cleared if I_DIRTY_INODE - * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But - * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already - * in place because writeback might already be in progress - * and we don't want to lose the time update - * I_NEW Serves as both a mutex and completion notification. - * New inodes set I_NEW. If two processes both create - * the same inode, one of them will release its inode and - * wait for I_NEW to be released before returning. - * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can - * also cause waiting on I_NEW, without I_NEW actually - * being set. find_inode() uses this to prevent returning - * nearly-dead inodes. - * I_WILL_FREE Must be set when calling write_inode_now() if i_count - * is zero. I_FREEING must be set when I_WILL_FREE is - * cleared. - * I_FREEING Set when inode is about to be freed but still has dirty - * pages or buffers attached or the inode itself is still - * dirty. - * I_CLEAR Added by clear_inode(). In this state the inode is - * clean and can be destroyed. Inode keeps I_FREEING. - * - * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are - * prohibited for many purposes. iget() must wait for - * the inode to be completely released, then create it - * anew. Other functions will just ignore such inodes, - * if appropriate. I_NEW is used for waiting. - * - * I_SYNC Writeback of inode is running. The bit is set during - * data writeback, and cleared with a wakeup on the bit - * address once it is done. The bit is also used to pin - * the inode in memory for flusher thread. - * - * I_REFERENCED Marks the inode as recently references on the LRU list. - * - * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to - * synchronize competing switching instances and to tell - * wb stat updates to grab the i_pages lock. See - * inode_switch_wbs_work_fn() for details. - * - * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper - * and work dirs among overlayfs mounts. - * - * I_CREATING New object's inode in the middle of setting up. - * - * I_DONTCACHE Evict inode as soon as it is not used anymore. - * - * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. - * Used to detect that mark_inode_dirty() should not move - * inode between dirty lists. - * - * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. - * - * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding - * i_count. - * - * Q: What is the difference between I_WILL_FREE and I_FREEING? - * - * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait - * upon. There's one free address left. 
- */ -#define __I_NEW 0 -#define I_NEW (1 << __I_NEW) -#define __I_SYNC 1 -#define I_SYNC (1 << __I_SYNC) -#define __I_LRU_ISOLATING 2 -#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) - -#define I_DIRTY_SYNC (1 << 3) -#define I_DIRTY_DATASYNC (1 << 4) -#define I_DIRTY_PAGES (1 << 5) -#define I_WILL_FREE (1 << 6) -#define I_FREEING (1 << 7) -#define I_CLEAR (1 << 8) -#define I_REFERENCED (1 << 9) -#define I_LINKABLE (1 << 10) -#define I_DIRTY_TIME (1 << 11) -#define I_WB_SWITCH (1 << 12) -#define I_OVL_INUSE (1 << 13) -#define I_CREATING (1 << 14) -#define I_DONTCACHE (1 << 15) -#define I_SYNC_QUEUED (1 << 16) -#define I_PINNING_NETFS_WB (1 << 17) - -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) -#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) - extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { @@ -2614,6 +2609,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline int icount_read(const struct inode *inode) +{ + return atomic_read(&inode->i_count); +} + /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. @@ -2713,12 +2713,6 @@ static inline bool is_mgtime(const struct inode *inode) return inode->i_opflags & IOP_MGTIME; } -extern struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); @@ -3281,7 +3275,7 @@ static inline bool is_dot_dotdot(const char *name, size_t len) /** * name_contains_dotdot - check if a file name contains ".." path components - * + * @name: File path string to check * Search for ".." surrounded by either '/' or start/end of string. 
*/ static inline bool name_contains_dotdot(const char *name) @@ -3313,8 +3307,8 @@ extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); -extern int generic_delete_inode(struct inode *inode); -static inline int generic_drop_inode(struct inode *inode) +extern int inode_just_drop(struct inode *inode); +static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } @@ -3393,7 +3387,6 @@ static inline struct inode *new_inode_pseudo(struct super_block *sb) extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); -extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); @@ -4023,4 +4016,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); +static inline bool extensible_ioctl_valid(unsigned int cmd_a, + unsigned int cmd_b, size_t min_size) +{ + if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b)) + return false; + if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b)) + return false; + if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b)) + return false; + if (_IOC_SIZE(cmd_a) < min_size) + return false; + return true; +} + #endif /* _LINUX_FS_H */ diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 7773eb870039..671f031be173 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -186,10 +186,12 @@ struct fc_log { extern __attribute__((format(printf, 4, 5))) void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...); -#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \ - l, fmt, ## __VA_ARGS__) -#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \ - l, fmt, ## __VA_ARGS__) +#define __logfc(fc, l, fmt, ...) \ + logfc((fc)->log.log, NULL, (l), (fmt), ## __VA_ARGS__) +#define __plogp(p, prefix, l, fmt, ...) \ + logfc((p)->log, (prefix), (l), (fmt), ## __VA_ARGS__) +#define __plog(p, l, fmt, ...) __plogp(p, (p)->prefix, l, fmt, ## __VA_ARGS__) + /** * infof - Store supplementary informational message * @fc: The context in which to log the informational message @@ -201,6 +203,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__) #define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) +#define infofcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'i', fmt, ## __VA_ARGS__) /** * warnf - Store supplementary warning message @@ -213,6 +217,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__) #define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__) #define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__) +#define warnfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'w', fmt, ## __VA_ARGS__) /** * errorf - Store supplementary error message @@ -225,6 +231,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define errorf(fc, fmt, ...) 
__logfc(fc, 'e', fmt, ## __VA_ARGS__) #define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__) #define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__) +#define errorfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'e', fmt, ## __VA_ARGS__) /** * invalf - Store supplementary invalid argument error message @@ -237,5 +245,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL) #define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL) #define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL) +#define invalfcp(fc, prefix, fmt, ...) \ + (errorfcp(fc, prefix, fmt, ## __VA_ARGS__), -EINVAL) #endif /* _LINUX_FS_CONTEXT_H */ diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 10dd161690a2..516aba5b858b 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -61,6 +61,12 @@ struct fscrypt_name { /* Crypto operations for filesystems */ struct fscrypt_operations { + /* + * The offset of the pointer to struct fscrypt_inode_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /* * If set, then fs/crypto/ will allocate a global bounce page pool the @@ -195,16 +201,44 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Returns the address of the fscrypt info pointer within the + * filesystem-specific part of the inode. (To save memory on filesystems that + * don't support fscrypt, a field in 'struct inode' itself is no longer used.) + */ +static inline struct fscrypt_inode_info ** +fscrypt_inode_info_addr(const struct inode *inode) +{ + VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0); + return (void *)inode + inode->i_sb->s_cop->inode_info_offs; +} + +/* + * Load the inode's fscrypt info pointer, using a raw dereference. Since this + * uses a raw dereference with no memory barrier, it is appropriate to use only + * when the caller knows the inode's key setup already happened, resulting in + * non-NULL fscrypt info. E.g., the file contents en/decryption functions use + * this, since fscrypt_file_open() set up the key. + */ +static inline struct fscrypt_inode_info * +fscrypt_get_inode_info_raw(const struct inode *inode) +{ + struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode); + + VFS_WARN_ON_ONCE(ci == NULL); + return ci; +} + static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). - * I.e., another task may publish ->i_crypt_info concurrently, executing - * a RELEASE barrier. We need to use smp_load_acquire() here to safely + * I.e., another task may publish the fscrypt info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ - return smp_load_acquire(&inode->i_crypt_info); + return smp_load_acquire(fscrypt_inode_info_addr(inode)); } /** diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 1eb7eae580be..5bc7280425a7 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -26,8 +26,16 @@ /* Arbitrary limit to bound the kmalloc() size. Can be changed. 
*/ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 +struct fsverity_info; + /* Verity operations for filesystems */ struct fsverity_operations { + /** + * The offset of the pointer to struct fsverity_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /** * Begin enabling verity on the given file. @@ -124,15 +132,37 @@ struct fsverity_operations { #ifdef CONFIG_FS_VERITY +/* + * Returns the address of the verity info pointer within the filesystem-specific + * part of the inode. (To save memory on filesystems that don't support + * fsverity, a field in 'struct inode' itself is no longer used.) + */ +static inline struct fsverity_info ** +fsverity_info_addr(const struct inode *inode) +{ + VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0); + return (void *)inode + inode->i_sb->s_vop->inode_info_offs; +} + static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* - * Pairs with the cmpxchg_release() in fsverity_set_info(). - * I.e., another task may publish ->i_verity_info concurrently, - * executing a RELEASE barrier. We need to use smp_load_acquire() here - * to safely ACQUIRE the memory the other task published. + * Since this function can be called on inodes belonging to filesystems + * that don't support fsverity at all, and fsverity_info_addr() doesn't + * work on such filesystems, we have to start with an IS_VERITY() check. + * Checking IS_VERITY() here is also useful to minimize the overhead of + * fsverity_active() on non-verity files. + */ + if (!IS_VERITY(inode)) + return NULL; + + /* + * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e., + * another task may publish the inode's verity info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely + * ACQUIRE the memory the other task published. */ - return smp_load_acquire(&inode->i_verity_info); + return smp_load_acquire(fsverity_info_addr(inode)); } /* enable.c */ @@ -156,12 +186,19 @@ void __fsverity_cleanup_inode(struct inode *inode); * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * - * Filesystems must call this on inode eviction to free ->i_verity_info. + * Filesystems must call this on inode eviction to free the inode's verity info. */ static inline void fsverity_cleanup_inode(struct inode *inode) { - if (inode->i_verity_info) + /* + * Only IS_VERITY() inodes can have verity info, so start by checking + * for IS_VERITY() (which is faster than retrieving the pointer to the + * verity info). This minimizes overhead for non-verity inodes. + */ + if (IS_VERITY(inode)) __fsverity_cleanup_inode(inode); + else + VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL); } /* read_metadata.c */ @@ -267,12 +304,12 @@ static inline bool fsverity_verify_page(struct page *page) * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * - * This checks whether ->i_verity_info has been set. + * This checks whether the inode's verity info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with - * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) + * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before the verity info.) 
* * Return: true if reads need to go through fs-verity, otherwise false */ @@ -287,7 +324,7 @@ static inline bool fsverity_active(const struct inode *inode) * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, - * set up the inode's ->i_verity_info if not already done. + * set up the inode's verity info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h new file mode 100644 index 000000000000..8838ca2f3d08 --- /dev/null +++ b/include/linux/hfs_common.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HFS/HFS+ common definitions, inline functions, + * and shared functionality. + */ + +#ifndef _HFS_COMMON_H_ +#define _HFS_COMMON_H_ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define hfs_dbg(fmt, ...) \ + pr_debug("pid %d:%s:%d %s(): " fmt, \ + current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__) + +#endif /* _HFS_COMMON_H_ */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1ef867bb8c44..2cf1bf65b225 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,14 +154,11 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) -{ - return ktime_sub(timer->node.expires, timer->base->get_time()); -} +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) +static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return timer->base->get_time(); + return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) @@ -200,8 +197,7 @@ __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { - return __hrtimer_expires_remaining_adjusted(timer, - timer->base->get_time()); + return __hrtimer_expires_remaining_adjusted(timer, hrtimer_cb_get_time(timer)); } #ifdef CONFIG_TIMERFD @@ -363,7 +359,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { - return hrtimer_forward(timer, timer->base->get_time(), interval); + return hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval); } /* Precise sleep: */ diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 84a5045f80f3..aa49ffa130e5 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -41,7 +41,6 @@ * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers - * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -51,7 +50,6 @@ struct hrtimer_clock_base { seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; - ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 80a178f3d896..12f5ee43850e 100644 --- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h @@ -420,9 +420,6 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; - struct io_alloc_cache msg_cache; - spinlock_t msg_lock; - #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 14f7eaf1b443..079d8773790c 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -118,8 +118,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk); -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk); +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { if (!current->io_context) return 0; @@ -129,7 +129,7 @@ static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h index 4696abfd311c..3e85afe794c0 100644 --- a/include/linux/iosys-map.h +++ b/include/linux/iosys-map.h @@ -264,12 +264,7 @@ static inline bool iosys_map_is_set(const struct iosys_map *map) */ static inline void iosys_map_clear(struct iosys_map *map) { - if (map->is_iomem) { - map->vaddr_iomem = NULL; - map->is_iomem = false; - } else { - map->vaddr = NULL; - } + memset(map, 0, sizeof(*map)); } /** diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index c4aa58032faf..f9a17fbbd398 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -160,7 +160,7 @@ size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2 do { struct folio *folio = folioq_folio(folioq, slot); - size_t part, remain, consumed; + size_t part, remain = 0, consumed; size_t fsize; void *base; @@ -168,14 +168,16 @@ size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2 break; fsize = folioq_folio_size(folioq, slot); - base = kmap_local_folio(folio, skip); - part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); - remain = step(base, progress, part, priv, priv2); - kunmap_local(base); - consumed = part - remain; - len -= consumed; - progress += consumed; - skip += consumed; + if (skip < fsize) { + base = kmap_local_folio(folio, skip); + part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); + remain = step(base, progress, part, priv, priv2); + kunmap_local(base); + consumed = part - remain; + len -= consumed; + progress += consumed; + skip += consumed; + } if (skip >= fsize) { skip = 0; slot++; diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..12faca29bbb9 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,20 +129,25 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) -extern struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) +{ + return container_of(ns, struct ipc_namespace, ns); +} + +extern struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace 
*user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { if (ns) { - if (refcount_inc_not_zero(&ns->ns.count)) + if (ns_ref_get(ns)) return ns; } @@ -151,7 +156,7 @@ static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns extern void put_ipc_ns(struct ipc_namespace *ns); #else -static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d6b606a81ef..c67e76fbcc07 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -669,6 +669,8 @@ extern int irq_chip_set_parent_state(struct irq_data *data, extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); +extern void irq_chip_shutdown_parent(struct irq_data *data); +extern unsigned int irq_chip_startup_parent(struct irq_data *data); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); @@ -976,10 +978,6 @@ static inline void irq_free_desc(unsigned int irq) irq_free_descs(irq, 1); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq); -#endif - /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 91b20788273d..0d1927da8055 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -61,7 +61,7 @@ extern void register_refined_jiffies(long clock_tick_rate); -/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ +/* TICK_USEC is the time between ticks in usec */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 890011071f2b..fe5ce9215821 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -562,7 +562,7 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size); +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, @@ -574,7 +574,7 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } @@ -610,7 +610,7 @@ static __always_inline void kasan_poison_vmalloc(const void *start, static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } diff --git a/include/linux/kcov.h b/include/linux/kcov.h index 75a2fb8b16c3..0143358874b0 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -57,47 +57,21 @@ static inline void 
kcov_remote_start_usb(u64 id) /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary - * workaround for KCOV's lack of nested remote coverage sections support. - * - * Adding support is tracked in https://bugzilla.kernel.org/show_bug.cgi?id=210337. - * - * kcov_remote_start_usb_softirq(): - * - * 1. Only collects coverage when called in the softirq context. This allows - * avoiding nested remote coverage collection sections in the task context. - * For example, USB/IP calls usb_hcd_giveback_urb() in the task context - * within an existing remote coverage collection section. Thus, KCOV should - * not attempt to start collecting coverage within the coverage collection - * section in __usb_hcd_giveback_urb() in this case. - * - * 2. Disables interrupts for the duration of the coverage collection section. - * This allows avoiding nested remote coverage collection sections in the - * softirq context (a softirq might occur during the execution of a work in - * the BH workqueue, which runs with in_serving_softirq() > 0). - * For example, usb_giveback_urb_bh() runs in the BH workqueue with - * interrupts enabled, so __usb_hcd_giveback_urb() might be interrupted in - * the middle of its remote coverage collection section, and the interrupt - * handler might invoke __usb_hcd_giveback_urb() again. + * workaround for KCOV's lack of nested remote coverage sections support in + * task context. Adding support for nested sections is tracked in: + * https://bugzilla.kernel.org/show_bug.cgi?id=210337 */ -static inline unsigned long kcov_remote_start_usb_softirq(u64 id) +static inline void kcov_remote_start_usb_softirq(u64 id) { - unsigned long flags = 0; - - if (in_serving_softirq()) { - local_irq_save(flags); + if (in_serving_softirq() && !in_hardirq()) kcov_remote_start_usb(id); - } - - return flags; } -static inline void kcov_remote_stop_softirq(unsigned long flags) +static inline void kcov_remote_stop_softirq(void) { - if (in_serving_softirq()) { + if (in_serving_softirq() && !in_hardirq()) kcov_remote_stop(); - local_irq_restore(flags); - } } #ifdef CONFIG_64BIT @@ -131,11 +105,8 @@ static inline u64 kcov_common_handle(void) } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} -static inline unsigned long kcov_remote_start_usb_softirq(u64 id) -{ - return 0; -} -static inline void kcov_remote_stop_softirq(unsigned long flags) {} +static inline void kcov_remote_start_usb_softirq(u64 id) {} +static inline void kcov_remote_stop_softirq(void) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1b10a5d84b68..39fe3e6cd282 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -460,7 +460,8 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ - KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG) + KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ + KEXEC_FILE_NO_CMA) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c3..adbe234a6f6c 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -211,7 +211,7 @@ LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_post_open, struct file *file, int mask) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct
task_struct *task, - unsigned long clone_flags) + u64 clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 090d1d3e19fe..79ec5a2bdcca 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -116,6 +116,9 @@ struct lsm_blob_sizes { int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ int lbs_tun_dev; int lbs_bdev; + int lbs_bpf_map; + int lbs_bpf_prog; + int lbs_bpf_token; }; /* diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b96746376e17..fcda8481de9a 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -40,8 +40,9 @@ extern unsigned long long max_possible_pfn; * via a driver, and never indicated in the firmware-provided memory map as * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the * kernel resource tree. - * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are - * not initialized (only for reserved regions). + * @MEMBLOCK_RSRV_NOINIT: reserved memory region for which struct pages are not + * fully initialized. Users of this flag are responsible for properly + * initializing struct pages of this region. * @MEMBLOCK_RSRV_KERN: memory region that is reserved for kernel use, * either explictitly with memblock_reserve_kern() or via memblock * allocation APIs. All memblock allocations set this flag. diff --git a/include/linux/migrate.h b/include/linux/migrate.h index acadd41e0b5c..9009e27b5f44 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -79,6 +79,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); +int set_movable_ops(const struct movable_operations *ops, enum pagetype type); #else @@ -100,6 +101,10 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, { return -ENOSYS; } +static inline int set_movable_ops(const struct movable_operations *ops, enum pagetype type) +{ + return -ENOSYS; +} #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c5fbfb85749..10fe492e1fed 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -663,6 +663,7 @@ struct mlx5e_resources { bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; + netdevice_tracker tracker; struct mutex uplink_netdev_lock; struct mlx5_crypto_dek_priv *dek_priv; }; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 86055d55836d..6ac76a0c3827 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -308,6 +308,8 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size); void mlx5_fc_local_destroy(struct mlx5_fc *counter); +void mlx5_fc_local_get(struct mlx5_fc *counter); +void mlx5_fc_local_put(struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db93..7f625c35128b 100644 ---
a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -631,6 +631,11 @@ static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } + +static inline bool ptdesc_pmd_is_shared(struct ptdesc *ptdesc) +{ + return !!ptdesc_pmd_pts_count(ptdesc); +} #else static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) { @@ -1102,6 +1107,11 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + /* the ABI-related flags from the ELF header. Used for core dump */ + unsigned long saved_e_flags; +#endif + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b64816..0acd1089d149 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,7 +11,9 @@ struct fs_struct; struct user_namespace; struct ns_common; -extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, +extern struct mnt_namespace init_mnt_ns; + +extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) diff --git a/include/linux/msi.h b/include/linux/msi.h index e5e86a8529fb..d415dd15a0a9 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -431,8 +431,6 @@ struct msi_domain_info; * function. * @domain_free_irqs: Optional function to override the default free * function. - * @msi_post_free: Optional function which is invoked after freeing - * all interrupts. * @msi_translate: Optional translate callback to support the odd wire to * MSI bridges, e.g. 
MBIGEN * @@ -473,8 +471,6 @@ struct msi_domain_ops { struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, struct device *dev); - void (*msi_post_free)(struct irq_domain *domain, - struct device *dev); int (*msi_translate)(struct irq_domain *domain, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type); }; @@ -568,6 +564,8 @@ enum { MSI_FLAG_PARENT_PM_DEV = (1 << 8), /* Support for parent mask/unmask */ MSI_FLAG_PCI_MSI_MASK_PARENT = (1 << 9), + /* Support for parent startup/shutdown */ + MSI_FLAG_PCI_MSI_STARTUP_PARENT = (1 << 10), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), diff --git a/include/linux/namei.h b/include/linux/namei.h index 5d085428e471..a7800ef04e76 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -57,13 +57,17 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); - -extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); -extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); -extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *kern_path_locked_negative(const char *, struct path *); -extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); +struct dentry *kern_path_parent(const char *name, struct path *parent); + +extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); +extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); +extern void end_creating_path(struct path *, struct dentry *); +extern struct dentry *start_removing_path(const char *, struct path *); +extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); +static inline void end_removing_path(struct path *path , struct dentry *dentry) +{ + end_creating_path(path, dentry); +} int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); @@ -80,6 +84,9 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5e5de4b0a433..f3a3b761abfb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2071,6 +2071,8 @@ enum netdev_reg_state { * @max_pacing_offload_horizon: max EDT offload horizon in nsec. * @napi_config: An array of napi_config structures containing per-NAPI * settings. + * @num_napi_configs: number of allocated NAPI config structs, + * always >= max(num_rx_queues, num_tx_queues). * @gro_flush_timeout: timeout for GRO layer in NAPI * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. 
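[Editor's note] The namei.h hunk above renames the path-creation helpers: kern_path_create()/done_path_create() become start_creating_path()/end_creating_path(), with matching start_removing_*()/end_removing_path() wrappers on the removal side. A minimal sketch of the assumed calling pattern under the new names follows; the caller, its arguments, and the use of vfs_mknod() are illustrative and not taken from this patch:

#include <linux/fcntl.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/namei.h>

static int example_create_node(const char *pathname, umode_t mode, dev_t dev)
{
	struct path parent;
	struct dentry *dentry;
	int err;

	/* Resolve and lock the parent directory, returning a negative dentry. */
	dentry = start_creating_path(AT_FDCWD, pathname, &parent, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	err = vfs_mknod(mnt_idmap(parent.mnt), d_inode(parent.dentry),
			dentry, mode, dev);

	/* Unlock the parent and drop the references taken above. */
	end_creating_path(&parent, dentry);
	return err;
}

The start_*/end_* pairing makes the lock-and-reference lifetime explicit at the call site, which appears to be the point of the rename.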
@@ -2482,8 +2484,9 @@ struct net_device { u64 max_pacing_offload_horizon; struct napi_config *napi_config; - unsigned long gro_flush_timeout; + u32 num_napi_configs; u32 napi_defer_hard_irqs; + unsigned long gro_flush_timeout; /** * @up: copy of @state's IFF_UP, but safe to read with just @lock. diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 185bd8196503..98c96d649bf9 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -150,6 +150,7 @@ struct netfs_io_stream { bool active; /* T if stream is active */ bool need_retry; /* T if this stream needs retrying */ bool failed; /* T if this stream failed */ + bool transferred_valid; /* T if ->transferred is valid */ }; /* diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 169b4ae30ff4..9aed39abc94b 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -160,6 +160,7 @@ extern void nfs_join_page_group(struct nfs_page *head, extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern bool nfs_page_group_sync_on_bit_locked(struct nfs_page *, unsigned int); extern int nfs_page_set_headlock(struct nfs_page *req); extern void nfs_page_clear_headlock(struct nfs_page *req); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7d22ea50b098..f5b68b8abb54 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -3,14 +3,151 @@ #define _LINUX_NS_COMMON_H #include <linux/refcount.h> +#include <linux/rbtree.h> +#include <uapi/linux/sched.h> struct proc_ns_operations; +struct cgroup_namespace; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; + struct ns_common { + u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t count; + refcount_t __ns_ref; /* do not use directly */ + union { + struct { + u64 ns_id; + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; + struct rcu_head ns_rcu; + }; }; +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); +void __ns_common_free(struct ns_common *ns); + +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *:
&(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) + +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + __inum) + +#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) + +static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) +{ + return refcount_dec_and_test(&ns->__ns_ref); +} + +static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) +{ + return refcount_inc_not_zero(&ns->__ns_ref); +} + +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) +#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) +#define ns_ref_put_and_lock(__ns, __lock) \ + refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) + #endif diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h new file mode 100644 index 000000000000..e5a5fa83d36b --- /dev/null +++ b/include/linux/nsfs.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ + +#ifndef _LINUX_NSFS_H +#define _LINUX_NSFS_H + +#include <linux/ns_common.h> +#include <linux/cred.h> +#include <linux/pid_namespace.h> + +struct path; +struct task_struct; +struct proc_ns_operations; + +int ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +typedef struct ns_common *ns_get_path_helper_t(void *); +int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, + void *private_data); + +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +void nsfs_init(void); + +#define __current_namespace_from_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: current->nsproxy->cgroup_ns, \ + struct ipc_namespace *: current->nsproxy->ipc_ns, \ + struct net *: current->nsproxy->net_ns, \ + struct pid_namespace *: task_active_pid_ns(current), \ + struct mnt_namespace *: current->nsproxy->mnt_ns, \ + struct time_namespace *: current->nsproxy->time_ns, \ + struct user_namespace *: current_user_ns(), \ + struct uts_namespace *: current->nsproxy->uts_ns) + +#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) + +#endif /* _LINUX_NSFS_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a22..bd118a187dec 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,17 +42,6 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns->ns), \ - struct ipc_namespace *: &(__ns->ns), \ - struct net *: &(__ns->ns), \ - struct pid_namespace *: &(__ns->ns), \ - struct mnt_namespace *: &(__ns->ns), \ - struct time_namespace *: &(__ns->ns), \ - struct user_namespace *: &(__ns->ns), \ - struct uts_namespace *: &(__ns->ns)) - /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. 
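[Editor's note] Taken together, the ns_common.h and nsfs.h additions above centralize namespace reference counting behind __ns_ref and dispatch on the namespace type with _Generic(), so every namespace flavor shares one set of helpers. A hedged sketch of how a get/put pair for one namespace type might sit on top of these primitives (modeled on the get_ipc_ns()/get_pid_ns() conversions elsewhere in this diff; the uts wrappers are illustrative, not part of the patch):

#include <linux/ns_common.h>
#include <linux/utsname.h>

static inline struct uts_namespace *example_get_uts_ns(struct uts_namespace *ns)
{
	if (ns)
		ns_ref_inc(ns);	/* refcount_inc() on to_ns_common(ns)->__ns_ref */
	return ns;
}

static inline struct uts_namespace *
example_get_uts_ns_not_zero(struct uts_namespace *ns)
{
	/* ns_ref_get() refuses to resurrect a count that already hit zero. */
	if (ns && ns_ref_get(ns))
		return ns;
	return NULL;
}

Because to_ns_common() is a _Generic() table rather than a void-pointer cast, passing the wrong namespace type fails at compile time instead of at runtime.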
@@ -103,7 +92,7 @@ static inline struct cred *nsset_cred(struct nsset *set) * */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk); +int copy_namespaces(u64 flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); diff --git a/include/linux/nstree.h b/include/linux/nstree.h new file mode 100644 index 000000000000..8b8636690473 --- /dev/null +++ b/include/linux/nstree.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NSTREE_H +#define _LINUX_NSTREE_H + +#include <linux/ns_common.h> +#include <linux/nsproxy.h> +#include <linux/rbtree.h> +#include <linux/seqlock.h> +#include <linux/rculist.h> +#include <linux/cookie.h> + +extern struct ns_tree cgroup_ns_tree; +extern struct ns_tree ipc_ns_tree; +extern struct ns_tree mnt_ns_tree; +extern struct ns_tree net_ns_tree; +extern struct ns_tree pid_ns_tree; +extern struct ns_tree time_ns_tree; +extern struct ns_tree user_ns_tree; +extern struct ns_tree uts_ns_tree; + +#define to_ns_tree(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(cgroup_ns_tree), \ + struct ipc_namespace *: &(ipc_ns_tree), \ + struct net *: &(net_ns_tree), \ + struct pid_namespace *: &(pid_ns_tree), \ + struct mnt_namespace *: &(mnt_ns_tree), \ + struct time_namespace *: &(time_ns_tree), \ + struct user_namespace *: &(user_ns_tree), \ + struct uts_namespace *: &(uts_ns_tree)) + +u64 ns_tree_gen_id(struct ns_common *ns); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, + bool previous); + +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +{ + ns_tree_gen_id(ns); + __ns_tree_add_raw(ns, ns_tree); +} + +/** + * ns_tree_add_raw - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function adds a namespace to the appropriate namespace tree + * without assigning an id. + */ +#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_add - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function assigns a new id to the namespace and adds it to the + * appropriate namespace tree and list. + */ +#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_remove - Remove a namespace from a namespace tree + * @ns: Namespace to remove + * + * This function removes a namespace from the appropriate namespace + * tree and list.
+ */ +#define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns)) + +#define ns_tree_adjoined_rcu(__ns, __previous) \ + __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) + +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) + +#endif /* _LINUX_NSTREE_H */ diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 682472c15495..88e18615dd72 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -134,6 +134,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_kernel_page_table_range_lockless(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ec9d96025683..fd1d91017b99 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,7 +859,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; - atomic_t mmap_count; + refcount_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; @@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/include/linux/pgalloc.h b/include/linux/pgalloc.h new file mode 100644 index 000000000000..9174fa59bbc5 --- /dev/null +++ b/include/linux/pgalloc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PGALLOC_H +#define _LINUX_PGALLOC_H + +#include <linux/pgtable.h> +#include <asm/pgalloc.h> + +/* + * {pgd,p4d}_populate_kernel() are defined as macros to allow + * compile-time optimization based on the configured page table levels. + * Without this, linking may fail because callers (e.g., KASAN) may rely + * on calls to these functions being optimized away when passing symbols + * that exist only for certain page table levels. + */ +#define pgd_populate_kernel(addr, pgd, p4d) \ + do { \ + pgd_populate(&init_mm, pgd, p4d); \ + if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED) \ + arch_sync_kernel_mappings(addr, addr); \ + } while (0) + +#define p4d_populate_kernel(addr, p4d, pud) \ + do { \ + p4d_populate(&init_mm, p4d, pud); \ + if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_P4D_MODIFIED) \ + arch_sync_kernel_mappings(addr, addr); \ + } while (0) + +#endif /* _LINUX_PGALLOC_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 4c035637eeb7..25a7257052ff 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd) * and the mode cannot be used in interrupt context. 
*/ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) +static inline void arch_enter_lazy_mmu_mode(void) {} +static inline void arch_leave_lazy_mmu_mode(void) {} +static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint @@ -1467,6 +1467,22 @@ static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned } #endif +/* + * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values + * and let generic vmalloc, ioremap and page table update code know when + * arch_sync_kernel_mappings() needs to be called. + */ +#ifndef ARCH_PAGE_TABLE_SYNC_MASK +#define ARCH_PAGE_TABLE_SYNC_MASK 0 +#endif + +/* + * There is no default implementation for arch_sync_kernel_mappings(). It is + * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK + * is 0. + */ +void arch_sync_kernel_mappings(unsigned long start, unsigned long end); + #endif /* CONFIG_MMU */ /* @@ -1938,10 +1954,11 @@ static inline bool arch_has_pfn_modify_check(void) /* * Page Table Modification bits for pgtbl_mod_mask. * - * These are used by the p?d_alloc_track*() set of functions an in the generic - * vmalloc/ioremap code to track at which page-table levels entries have been - * modified. Based on that the code can better decide when vmalloc and ioremap - * mapping changes need to be synchronized to other page-tables in the system. + * These are used by the p?d_alloc_track*() and p*d_populate_kernel() + * functions in the generic vmalloc, ioremap and page table update code + * to track at which page-table levels entries have been modified. + * Based on that the code can better decide when page table changes need + * to be synchronized to other page-tables in the system. 
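The block moved into pgtable.h above defines a contract rather than behaviour: mapping code accumulates PGTBL_*_MODIFIED bits as it populates levels, then calls arch_sync_kernel_mappings() only when the architecture opted in via ARCH_PAGE_TABLE_SYNC_MASK. A hedged caller-side sketch of that flow (map_range() and its body are illustrative, not kernel code):

typedef unsigned int pgtbl_mod_mask;

#define PGTBL_PGD_MODIFIED (1U << 0)
#define PGTBL_P4D_MODIFIED (1U << 1)

/* an architecture that needs PGD-level changes propagated */
#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PGD_MODIFIED

void arch_sync_kernel_mappings(unsigned long start, unsigned long end);

static void map_range(unsigned long start, unsigned long end)
{
	pgtbl_mod_mask mask = 0;

	/*
	 * Each populate step ORs in the bit for the level it touched;
	 * here we pretend a new PGD entry was installed.
	 */
	mask |= PGTBL_PGD_MODIFIED;

	/* optimized out entirely when the arch mask is 0 */
	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}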
*/ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 diff --git a/include/linux/phy.h b/include/linux/phy.h index 4c2b8b6e7187..bb45787d8684 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -169,6 +169,11 @@ static inline bool phy_interface_empty(const unsigned long *intf) return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX); } +static inline unsigned int phy_interface_weight(const unsigned long *intf) +{ + return bitmap_weight(intf, PHY_INTERFACE_MODE_MAX); +} + static inline void phy_interface_and(unsigned long *dst, const unsigned long *a, const unsigned long *b) { diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..445517a72ad0 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -54,10 +54,15 @@ extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -78,12 +83,15 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) } #endif -extern struct pid_namespace *copy_pid_ns(unsigned long flags, +extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); +extern bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor); + #else /* !CONFIG_PID_NS */ #include <linux/err.h> @@ -97,7 +105,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) return 0; } -static inline struct pid_namespace *copy_pid_ns(unsigned long flags, +static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) @@ -118,6 +126,12 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } + +static inline bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + return false; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/include/linux/platform_data/dmtimer-omap.h b/include/linux/platform_data/dmtimer-omap.h index 95d852aef130..726d89143842 100644 --- a/include/linux/platform_data/dmtimer-omap.h +++ b/include/linux/platform_data/dmtimer-omap.h @@ -36,9 +36,13 @@ struct omap_dm_timer_ops { int (*set_pwm)(struct omap_dm_timer *timer, int def_on, int toggle, int trigger, int autoreload); int (*get_pwm_status)(struct omap_dm_timer *timer); + int (*set_cap)(struct omap_dm_timer *timer, + int autoreload, bool config_period); + int (*get_cap_status)(struct omap_dm_timer *timer); int (*set_prescaler)(struct omap_dm_timer *timer, int prescaler); unsigned int (*read_counter)(struct omap_dm_timer *timer); + unsigned int (*read_cap)(struct omap_dm_timer *timer, bool is_period); int (*write_counter)(struct omap_dm_timer *timer, unsigned int value); unsigned int (*read_status)(struct omap_dm_timer *timer); diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index 78276a11c48d..1571e9157fa5 100644 --- a/include/linux/platform_data/x86/int3472.h +++ 
b/include/linux/platform_data/x86/int3472.h @@ -27,6 +27,7 @@ #define INT3472_GPIO_TYPE_CLK_ENABLE 0x0c #define INT3472_GPIO_TYPE_PRIVACY_LED 0x0d #define INT3472_GPIO_TYPE_HANDSHAKE 0x12 +#define INT3472_GPIO_TYPE_HOTPLUG_DETECT 0x13 #define INT3472_PDEV_MAX_NAME_LEN 23 #define INT3472_MAX_SENSOR_GPIOS 3 diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index c84edf217819..f67a2cb7d781 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -115,6 +115,12 @@ struct dev_pm_domain_list { * genpd provider specific way, likely through a * parent device node. This flag makes genpd to * skip its internal support for this. + * + * GENPD_FLAG_NO_STAY_ON: For genpd OF providers a powered-on PM domain at + * initialization is prevented from being + * powered-off until the ->sync_state() callback is + * invoked. This flag informs genpd to allow a + * power-off without waiting for ->sync_state(). */ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) @@ -126,6 +132,7 @@ struct dev_pm_domain_list { #define GENPD_FLAG_OPP_TABLE_FW (1U << 7) #define GENPD_FLAG_DEV_NAME_FW (1U << 8) #define GENPD_FLAG_NO_SYNC_STATE (1U << 9) +#define GENPD_FLAG_NO_STAY_ON (1U << 10) enum gpd_status { GENPD_STATE_ON = 0, /* PM domain is on */ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 1fad1c8a4c76..102202185d7a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -372,7 +372,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, /* * Migrate-Disable and why it is undesired. * - * When a preempted task becomes elegible to run under the ideal model (IOW it + * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() @@ -387,7 +387,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be - * in a migrate_disable() section, reducing it's available bandwidth. + * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully @@ -399,7 +399,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, - * all these primitives employ migirate_disable() to restore this implicit + * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. The correct solution is getting @@ -407,7 +407,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need - * a schedulability theory that does not depend on abritrary migration. + * a schedulability theory that does not depend on arbitrary migration. 
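For the GENPD_FLAG_NO_STAY_ON addition in pm_domain.h above, a provider-side sketch of where the flag would be set; the domain name and probe wiring are invented for illustration, only the flag and pm_genpd_init() come from the genpd API:

#include <linux/pm_domain.h>
#include <linux/platform_device.h>

static struct generic_pm_domain example_genpd = {
	.name = "example-domain",	/* illustrative */
};

static int example_provider_probe(struct platform_device *pdev)
{
	/* domain may power off before ->sync_state() is invoked */
	example_genpd.flags = GENPD_FLAG_NO_STAY_ON;

	return pm_genpd_init(&example_genpd, NULL, false /* starts on */);
}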
* * * Notes on the implementation. @@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * work-conserving schedulers. * */ -extern void migrate_disable(void); -extern void migrate_enable(void); /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section @@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void) DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #ifdef CONFIG_PREEMPT_DYNAMIC diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4b20375f3783..e81b8e596e4f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -5,7 +5,7 @@ #ifndef _LINUX_PROC_NS_H #define _LINUX_PROC_NS_H -#include <linux/ns_common.h> +#include <linux/nsfs.h> #include <uapi/linux/nsfs.h> struct pid_namespace; @@ -17,7 +17,6 @@ struct inode; struct proc_ns_operations { const char *name; const char *real_ns_name; - int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsset *nsset, struct ns_common *ns); @@ -66,25 +65,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_alloc_inum(struct ns_common *ns) -{ - WRITE_ONCE(ns->stashed, NULL); - return proc_alloc_inum(&ns->inum); -} - -#define ns_free_inum(ns) proc_free_inum((ns)->inum) - #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -extern int ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -typedef struct ns_common *ns_get_path_helper_t(void *); -extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, - void *private_data); - -extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); - -extern int ns_get_name(char *buf, size_t size, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -extern void nsfs_init(void); #endif /* _LINUX_PROC_NS_H */ diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 1504fb012c05..540abf7de048 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,8 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_SFS_GET_FW_VERSIONS, + PSP_SFS_UPDATE, PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..f67f96711f0d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -24,7 +24,7 @@ #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> -#include <linux/preempt.h> +#include <linux/sched.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> @@ -962,6 +962,20 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) preempt_enable_notrace(); } +static __always_inline void rcu_read_lock_dont_migrate(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_disable(); + rcu_read_lock(); +} + +static inline void rcu_read_unlock_migrate(void) +{ + rcu_read_unlock(); + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_enable(); +} + /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. 
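A usage sketch for the rcu_read_lock_dont_migrate()/rcu_read_unlock_migrate() pair added to rcupdate.h above: on PREEMPT_RCU kernels an RCU reader may be preempted and migrated, so callers that also rely on staying on one CPU get migrate_disable() folded in. The list and item types below are invented for the example:

#include <linux/rcupdate.h>
#include <linux/rculist.h>

struct item {
	struct list_head node;
	int per_cpu_hint;
};

static int peek_first_hint(struct list_head *head)
{
	struct item *it;
	int hint = -1;

	rcu_read_lock_dont_migrate();
	it = list_first_or_null_rcu(head, struct item, node);
	if (it)
		hint = it->per_cpu_hint;	/* no migration in here */
	rcu_read_unlock_migrate();

	return hint;
}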
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 6fb4894b8cfd..a7d92718b653 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -157,27 +157,42 @@ struct rdt_ctrl_domain { }; /** + * struct mbm_cntr_cfg - Assignable counter configuration. + * @evtid: MBM event to which the counter is assigned. Only valid + * if @rdtgrp is not NULL. + * @rdtgrp: resctrl group assigned to the counter. NULL if the + * counter is free. + */ +struct mbm_cntr_cfg { + enum resctrl_event_id evtid; + struct rdtgroup *rdtgrp; +}; + +/** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth + * @mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct mbm_state + * indexed by RMID on x86 or combined CLOSID, RMID on Arm. * @mbm_over: worker to periodically read MBM h/w counters * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters + * @cntr_cfg: array of assignable counters' configuration (indexed + * by counter ID) */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; + struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS]; struct delayed_work mbm_over; struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; + struct mbm_cntr_cfg *cntr_cfg; }; /** @@ -256,39 +271,52 @@ enum resctrl_schema_fmt { }; /** + * struct resctrl_mon - Monitoring related data of a resctrl resource. + * @num_rmid: Number of RMIDs available. + * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth + * monitoring events can be configured. + * @num_mbm_cntrs: Number of assignable counters. + * @mbm_cntr_assignable:Is system capable of supporting counter assignment? + * @mbm_assign_on_mkdir:True if counters should automatically be assigned to MBM + * events of monitor groups created via mkdir. + */ +struct resctrl_mon { + int num_rmid; + unsigned int mbm_cfg_mask; + int num_mbm_cntrs; + bool mbm_cntr_assignable; + bool mbm_assign_on_mkdir; +}; + +/** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine - * @num_rmid: Number of RMIDs available * @ctrl_scope: Scope of this resource for control functions * @mon_scope: Scope of this resource for monitor functions * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. + * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @evt_list: List of monitoring events - * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth * monitoring events can be configured.
* @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { int rid; bool alloc_capable; bool mon_capable; - int num_rmid; enum resctrl_scope ctrl_scope; enum resctrl_scope mon_scope; struct resctrl_cache cache; struct resctrl_membw membw; + struct resctrl_mon mon; struct list_head ctrl_domains; struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - struct list_head evt_list; - unsigned int mbm_cfg_mask; bool cdp_capable; }; @@ -372,8 +400,29 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid); + +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) +{ + return (eventid >= QOS_L3_MBM_TOTAL_EVENT_ID && + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id eventid); + +/* Iterate over all memory bandwidth events */ +#define for_each_mbm_event_id(eventid) \ + for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID; eventid++) + +/* Iterate over memory bandwidth arrays in domain structures */ +#define for_each_mbm_idx(idx) \ + for (idx = 0; idx < QOS_NUM_L3_MBM_EVENTS; idx++) + /** * resctrl_arch_mon_event_config_write() - Write the config for an event. * @config_info: struct resctrl_mon_config_info describing the resource, domain @@ -416,6 +465,26 @@ static inline u32 resctrl_get_config_index(u32 closid, bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +/** + * resctrl_arch_mbm_cntr_assign_enabled() - Check if MBM counter assignment + * mode is enabled. + * @r: Pointer to the resource structure. + * + * Return: + * true if the assignment mode is enabled, false otherwise. + */ +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); + +/** + * resctrl_arch_mbm_cntr_assign_set() - Configure the MBM counter assignment mode. + * @r: Pointer to the resource structure. + * @enable: Set to true to enable, false to disable the assignment mode. + * + * Return: + * 0 on success, < 0 on error. + */ +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. @@ -528,6 +597,63 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * */ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); +/** + * resctrl_arch_config_cntr() - Configure the counter with its new RMID + * and event details. + * @r: Resource structure. + * @d: The domain in which counter with ID @cntr_id should be configured. + * @evtid: Monitoring event type (e.g., QOS_L3_MBM_TOTAL_EVENT_ID + * or QOS_L3_MBM_LOCAL_EVENT_ID). + * @rmid: RMID. + * @closid: CLOSID. + * @cntr_id: Counter ID to configure. + * @assign: True to assign the counter or update an existing assignment, + * false to unassign the counter. + * + * This can be called from any CPU. 
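A hedged sketch of how the new iterators pair with the per-event @mbm_states array (the reset helper and @idx_limit are invented; MBM_STATE_IDX() comes from resctrl_types.h further below):

#include <linux/resctrl.h>
#include <linux/string.h>

static void reset_domain_mbm_state(struct rdt_mon_domain *d, u32 idx_limit)
{
	enum resctrl_event_id eventid;
	u32 idx;

	for_each_mbm_event_id(eventid) {
		struct mbm_state *states = d->mbm_states[MBM_STATE_IDX(eventid)];

		if (!states)
			continue;
		for (idx = 0; idx < idx_limit; idx++)
			memset(&states[idx], 0, sizeof(states[idx]));
	}
}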
+ */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign); + +/** + * resctrl_arch_cntr_read() - Read the event data corresponding to the counter ID + * assigned to the RMID, event pair for this resource + * and domain. + * @r: Resource that the counter should be read from. + * @d: Domain that the counter should be read from. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to read. + * @eventid: The MBM event to which @cntr_id is assigned. + * @val: Result of the counter read in bytes. + * + * Called on a CPU that belongs to domain @d when "mbm_event" mode is enabled. + * Called from a non-migrateable process context via smp_call_on_cpu() unless all + * CPUs are nohz_full, in which case it is called via IPI (smp_call_function_any()). + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val); + +/** + * resctrl_arch_reset_cntr() - Reset any private state associated with counter ID. + * @r: The domain's resource. + * @d: The counter ID's domain. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to reset. + * @eventid: The MBM event to which @cntr_id is assigned. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index a25fb9c4070d..acfe07860b34 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,11 +34,18 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) -/* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. - */ +/* Number of memory transactions that an MBM event can be configured with */ +#define NUM_MBM_TRANSACTIONS 7 + +/* Event IDs */ enum resctrl_event_id { + /* Must match value of first event below */ + QOS_FIRST_EVENT = 0x01, + + /* + * These values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ QOS_L3_OCCUP_EVENT_ID = 0x01, QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, @@ -47,4 +54,7 @@ enum resctrl_event_id { QOS_NUM_EVENTS, }; +#define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1) +#define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID) + #endif /* __LINUX_RESCTRL_TYPES_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb5598..69553e7c14c1 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,6 +7,12 @@ #include <linux/preempt.h> #include <linux/sched.h> +#ifdef CONFIG_MEMBARRIER +# define RSEQ_EVENT_GUARD irq +#else +# define RSEQ_EVENT_GUARD preempt +#endif + /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. 
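The RSEQ_EVENT_GUARD selector above lets one scoped_guard() site compile to irq protection when CONFIG_MEMBARRIER can touch the event mask from hard interrupt context, and to cheaper preempt protection otherwise. An invented caller for illustration (rseq_signal_deliver() just below follows the same shape):

static inline void rseq_note_event(struct task_struct *t, int bit)
{
	/* irq guard with CONFIG_MEMBARRIER, preempt guard without */
	scoped_guard(RSEQ_EVENT_GUARD)
		__set_bit(bit, &t->rseq_event_mask);
}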
@@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig, static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); rseq_handle_notify_resume(ksig, regs); } @@ -65,7 +70,7 @@ static inline void rseq_migrate(struct task_struct *t) * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; @@ -107,7 +112,7 @@ static inline void rseq_preempt(struct task_struct *t) static inline void rseq_migrate(struct task_struct *t) { } -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) diff --git a/include/linux/rv.h b/include/linux/rv.h index 14410a42faef..9520aab34bcb 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -7,16 +7,14 @@ #ifndef _LINUX_RV_H #define _LINUX_RV_H -#include <linux/types.h> -#include <linux/list.h> - #define MAX_DA_NAME_LEN 32 #define MAX_DA_RETRY_RACING_EVENTS 3 #ifdef CONFIG_RV +#include <linux/array_size.h> #include <linux/bitops.h> +#include <linux/list.h> #include <linux/types.h> -#include <linux/array_size.h> /* * Deterministic automaton per-object variables. */ diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h index 309ca72f2dfb..adcc43042c90 100644 --- a/include/linux/rw_hint.h +++ b/include/linux/rw_hint.h @@ -14,6 +14,7 @@ enum rw_hint { WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, + WRITE_LIFE_HINT_NR, } __packed; /* Sparse ignores __packed annotations on enums, hence the #ifndef below. */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b272382673d..cbb7340c5866 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,9 @@ #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> +#ifndef COMPILE_OFFSETS +#include <generated/rq-offsets.h> +#endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -706,7 +709,6 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; - unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -733,7 +735,6 @@ struct sched_dl_entity { * runnable task.
*/ struct rq *rq; - dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES @@ -883,6 +884,11 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif @@ -2152,6 +2158,8 @@ static inline struct mutex *__get_task_blocked_on(struct task_struct *p) static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); @@ -2162,8 +2170,8 @@ static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ - WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); - p->blocked_on = m; + WARN_ON_ONCE(blocked_on && blocked_on != m); + WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2174,16 +2182,19 @@ static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - WARN_ON_ONCE(!m); - /* Currently we serialize blocked_on under the mutex::wait_lock */ - lockdep_assert_held_once(&m->wait_lock); - /* - * There may be cases where we re-clear already cleared - * blocked_on relationships, but make sure we are not - * clearing the relationship with a different lock. - */ - WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); - p->blocked_on = NULL; + if (m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + } + WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2307,4 +2318,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); + +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +/* + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. + */ +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) +#else +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) + +static inline void __migrate_enable(void) +{ + struct task_struct *p = current; + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). 
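The reworked blocked_on helpers above snapshot the pointer once with READ_ONCE() so the WARN checks and the final store cannot observe two different values, and publish updates with WRITE_ONCE(). A standalone model of that discipline (simplified to a single shared pointer; the macros are the usual volatile-cast definitions):

#include <assert.h>
#include <stddef.h>

#define READ_ONCE(x)	 (*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

struct mutex;				/* opaque here */

static struct mutex *blocked_on;	/* written under a lock, read anywhere */

static void set_blocked_on(struct mutex *m)
{
	struct mutex *old = READ_ONCE(blocked_on);	/* one snapshot */

	assert(m);
	assert(!old || old == m);	/* never silently switch locks */
	WRITE_ONCE(blocked_on, m);
}

static void clear_blocked_on(struct mutex *m)
{
	struct mutex *old = READ_ONCE(blocked_on);

	assert(!m || !old || old == m);
	WRITE_ONCE(blocked_on, NULL);
}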
+ */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; +} + +static inline void __migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif + p->migration_disabled++; + return; + } + + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; +} +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ + +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. + */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static inline void migrate_disable(void) +{ + __migrate_disable(); +} + +static inline void migrate_enable(void) +{ + __migrate_enable(); +} +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ + +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ + +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + #endif diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 7047101dbf58..d82b7a9b0658 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -108,7 +108,11 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ - /* ops.dequeue (in REST) may be nested inside DISPATCH */ + /* + * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and + * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be + * nested inside DISPATCH.
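The nesting rules that __migrate_disable()/__migrate_enable() above implement can be modelled in a few lines: calls pair up, only the outermost enable unpins, and an unmatched enable is a bug. A standalone model (kernel types and per-CPU bookkeeping omitted):

#include <assert.h>

struct task {
	unsigned short migration_disabled;
};

static void model_migrate_disable(struct task *p)
{
	p->migration_disabled++;		/* nested calls just count up */
}

static void model_migrate_enable(struct task *p)
{
	assert(p->migration_disabled > 0);	/* superfluous enable */
	p->migration_disabled--;		/* outermost call unpins */
}

int main(void)
{
	struct task t = { 0 };

	model_migrate_disable(&t);
	model_migrate_disable(&t);		/* nested section */
	model_migrate_enable(&t);
	assert(t.migration_disabled == 1);	/* still pinned */
	model_migrate_enable(&t);
	assert(t.migration_disabled == 0);	/* migratable again */
	return 0;
}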
+ */ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1ef1edbaaf79..7d6449982822 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -226,6 +226,10 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + struct rw_semaphore cgroup_threadgroup_rwsem; +#endif + /* * Thread is the potential origin of an oom condition; kill first on * oom diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352b..34d6a0e108c3 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,7 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 5263746b63e8..bbcfdf12aa6e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -30,33 +30,24 @@ struct sd_flag_debug { }; extern const struct sd_flag_debug sd_flag_debug[]; +struct sched_domain_topology_level; + #ifdef CONFIG_SCHED_SMT -static inline int cpu_smt_flags(void) -{ - return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; -} +extern int cpu_smt_flags(void); +extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_CLUSTER -static inline int cpu_cluster_flags(void) -{ - return SD_CLUSTER | SD_SHARE_LLC; -} +extern int cpu_cluster_flags(void); +extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_MC -static inline int cpu_core_flags(void) -{ - return SD_SHARE_LLC; -} +extern int cpu_core_flags(void); +extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu); #endif -#ifdef CONFIG_NUMA -static inline int cpu_numa_flags(void) -{ - return SD_NUMA; -} -#endif +extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu); extern int arch_asym_cpu_priority(int cpu); @@ -172,7 +163,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); typedef int (*sched_domain_flags_f)(void); struct sd_data { diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..bd33f194c94a 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -489,7 +489,7 @@ int security_file_receive(struct file *file); int security_file_open(struct file *file); int security_file_post_open(struct file *file, int mask); int security_file_truncate(struct file *file); -int security_task_alloc(struct task_struct *task, unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, u64 clone_flags); void security_task_free(struct task_struct *task); 
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); @@ -567,7 +567,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, struct lsm_context *cp); -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); @@ -1215,7 +1216,7 @@ static inline int security_file_truncate(struct file *file) } static inline int security_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { return 0; } @@ -1551,7 +1552,8 @@ static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp) } static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, - struct lsm_context *cp) + struct lsm_context *cp, + int lsmid) { return -EOPNOTSUPP; } diff --git a/include/linux/sem.h b/include/linux/sem.h index c4deefe42aeb..275269ce2ec8 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -9,12 +9,12 @@ struct task_struct; #ifdef CONFIG_SYSVIPC -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern int copy_semundo(u64 clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else -static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_semundo(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 14b923ddb6df..fa633657e4c0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4172,6 +4172,8 @@ int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); +int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, + struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); diff --git a/include/linux/stddef.h b/include/linux/stddef.h index dab49e2ec8c0..80b6bfb944f0 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -94,7 +94,8 @@ enum { __DECLARE_FLEX_ARRAY(TYPE, NAME) /** - * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * __TRAILING_OVERLAP() - Overlap a flexible-array member with trailing + * members. * * Creates a union between a flexible-array member (FAM) in a struct and a set * of additional members that would otherwise follow it. @@ -102,15 +103,30 @@ enum { * @TYPE: Flexible structure type name, including "struct" keyword. * @NAME: Name for a variable to define. * @FAM: The flexible-array member within @TYPE + * @ATTRS: Any struct attributes (usually empty) * @MEMBERS: Trailing overlapping members. 
*/ -#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ +#define __TRAILING_OVERLAP(TYPE, NAME, FAM, ATTRS, MEMBERS) \ union { \ TYPE NAME; \ struct { \ - unsigned char __offset_to_##FAM[offsetof(TYPE, FAM)]; \ + unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \ MEMBERS \ - }; \ + } ATTRS; \ } +/** + * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * + * Creates a union between a flexible-array member (FAM) in a struct and a set + * of additional members that would otherwise follow it. + * + * @TYPE: Flexible structure type name, including "struct" keyword. + * @NAME: Name for a variable to define. + * @FAM: The flexible-array member within @TYPE + * @MEMBERS: Trailing overlapping members. + */ +#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ + __TRAILING_OVERLAP(TYPE, NAME, FAM, /* no attrs */, MEMBERS) + #endif diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index f3ba4f52ff26..6c4077be7742 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -17,6 +17,12 @@ #include <linux/types.h> +static inline const char *str_assert_deassert(bool v) +{ + return v ? "assert" : "deassert"; +} +#define str_deassert_assert(v) str_assert_deassert(!(v)) + static inline const char *str_enable_disable(bool v) { return v ? "enable" : "disable"; diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fd..7012a0f758d8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. 
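A usage sketch for TRAILING_OVERLAP() as (re)defined above: the union overlays the named trailing members on the storage that begins at the flexible-array member, so both views share one allocation. The structures are invented for the example; keeping the overlaid members compatible with the array's element type is the caller's responsibility:

#include <stddef.h>

struct hdr {
	int count;
	int vals[];		/* flexible-array member */
};

struct wrapper {
	TRAILING_OVERLAP(struct hdr, h, vals,
		int first;
		int second;
	);
};

/* w.h.vals[0] aliases w.first, w.h.vals[1] aliases w.second */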
+ */ + return !folio_test_large(folio); +} + extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 77f45e5d4413..66c06fcdfe19 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc94..c514d0e5a45c 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,17 +33,22 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +static inline struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} +void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); @@ -52,7 +57,7 @@ struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_time_ns(ns); } @@ -108,6 +113,10 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #else +static inline void __init time_ns_init(void) +{ +} + static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { @@ -129,7 +138,7 @@ static inline void put_time_ns(struct time_namespace *ns) } static inline -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index c27aac67cb3f..b8ae89ea28ab 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -76,6 +76,7 @@ struct tk_read_base { * @cs_was_changed_seq: The sequence number of clocksource change events * @clock_valid: Indicator for valid clock * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset + * @monotonic_to_aux: CLOCK_MONOTONIC to CLOCK_AUX offset * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP * interval. @@ -117,6 +118,9 @@ struct tk_read_base { * @offs_aux is used by the auxiliary timekeepers which do not utilize any * of the regular timekeeper offset fields. * + * @monotonic_to_aux is a timespec64 representation of @offs_aux to + * accelerate the VDSO update for CLOCK_AUX. + * * The cacheline ordering of the structure is optimized for in kernel usage of * the ktime_get() and ktime_get_ts64() family of time accessors. 
Struct * timekeeper is prepended in the core timekeeping code with a sequence count, @@ -159,7 +163,10 @@ struct timekeeper { u8 cs_was_changed_seq; u8 clock_valid; - struct timespec64 monotonic_to_boot; + union { + struct timespec64 monotonic_to_boot; + struct timespec64 monotonic_to_aux; + }; u64 cycle_interval; u64 xtime_interval; diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 57ed3035cc30..c52b862dad45 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -51,9 +51,15 @@ struct tnum tnum_xor(struct tnum a, struct tnum b); /* Multiply two tnums, return @a * @b */ struct tnum tnum_mul(struct tnum a, struct tnum b); +/* Return true if the known bits of both tnums have the same value */ +bool tnum_overlap(struct tnum a, struct tnum b); + /* Return a tnum representing numbers satisfying both @a and @b */ struct tnum tnum_intersect(struct tnum a, struct tnum b); +/* Return a tnum representing numbers satisfying either @t1 or @t2 */ +struct tnum tnum_union(struct tnum t1, struct tnum t2); + /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); diff --git a/include/linux/topology.h b/include/linux/topology.h index 33b7fda97d39..6575af39fd10 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -260,7 +260,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu) #endif -static inline const struct cpumask *cpu_cpu_mask(int cpu) +static inline const struct cpumask *cpu_node_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c39094..ee3d36eda45d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include <linux/wait.h> #include <linux/timer.h> #include <linux/seqlock.h> +#include <linux/mutex.h> struct uprobe; struct vm_area_struct; @@ -185,8 +186,14 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; + +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, void *data); + extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -194,7 +201,11 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, + bool is_register); +extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -205,7 +216,7 @@ extern void uprobe_start_dup_mmap(void); extern void
uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); -extern void uprobe_copy_process(struct task_struct *t, unsigned long flags); +extern void uprobe_copy_process(struct task_struct *t, u64 flags); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); @@ -224,8 +235,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; @@ -281,7 +297,7 @@ static inline bool uprobe_deny_signal(void) static inline void uprobe_free_utask(struct task_struct *t) { } -static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags) +static inline void uprobe_copy_process(struct task_struct *t, u64 flags) { } static inline void uprobe_clear_state(struct mm_struct *mm) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 8afa8c3a0973..57d1ff006090 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -33,7 +33,7 @@ extern void user_event_mm_dup(struct task_struct *t, extern void user_event_mm_remove(struct task_struct *t); static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { struct user_event_mm *old_mm; @@ -68,7 +68,7 @@ static inline void user_events_exit(struct task_struct *t) } #else static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { } diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a0bb6d012137..9a9aebbf96b9 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -168,10 +168,15 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -181,7 +186,7 @@ extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns_ref_put(ns)) __put_user_ns(ns); } diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h new file mode 100644 index 000000000000..60f37fec0f4b --- /dev/null +++ b/include/linux/uts_namespace.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UTS_NAMESPACE_H +#define _LINUX_UTS_NAMESPACE_H + +#include <linux/ns_common.h> +#include <uapi/linux/utsname.h> + +struct 
user_namespace; +extern struct user_namespace init_user_ns; + +struct uts_namespace { + struct new_utsname name; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct uts_namespace init_uts_ns; + +#ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static inline void get_uts_ns(struct uts_namespace *ns) +{ + ns_ref_inc(ns); +} + +extern struct uts_namespace *copy_utsname(u64 flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns); +extern void free_uts_ns(struct uts_namespace *ns); + +static inline void put_uts_ns(struct uts_namespace *ns) +{ + if (ns_ref_put(ns)) + free_uts_ns(ns); +} + +void uts_ns_init(void); +#else +static inline void get_uts_ns(struct uts_namespace *ns) +{ +} + +static inline void put_uts_ns(struct uts_namespace *ns) +{ +} + +static inline struct uts_namespace *copy_utsname(u64 flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns) +{ + if (flags & CLONE_NEWUTS) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline void uts_ns_init(void) +{ +} +#endif + +#endif /* _LINUX_UTS_NAMESPACE_H */ diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..547bd4439706 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -7,7 +7,7 @@ #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/err.h> -#include <uapi/linux/utsname.h> +#include <linux/uts_namespace.h> enum uts_proc { UTS_PROC_ARCH, @@ -18,57 +18,6 @@ enum uts_proc { UTS_PROC_DOMAINNAME, }; -struct user_namespace; -extern struct user_namespace init_user_ns; - -struct uts_namespace { - struct new_utsname name; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct ns_common ns; -} __randomize_layout; -extern struct uts_namespace init_uts_ns; - -#ifdef CONFIG_UTS_NS -static inline void get_uts_ns(struct uts_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -extern struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns); -extern void free_uts_ns(struct uts_namespace *ns); - -static inline void put_uts_ns(struct uts_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_uts_ns(ns); -} - -void uts_ns_init(void); -#else -static inline void get_uts_ns(struct uts_namespace *ns) -{ -} - -static inline void put_uts_ns(struct uts_namespace *ns) -{ -} - -static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns) -{ - if (flags & CLONE_NEWUTS) - return ERR_PTR(-EINVAL); - - return old_ns; -} - -static inline void uts_ns_init(void) -{ -} -#endif - #ifdef CONFIG_PROC_SYSCTL extern void uts_proc_notify(enum uts_proc proc); #else diff --git a/include/linux/verification.h b/include/linux/verification.h index 4f3022d081c3..dec7f2beabfd 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -36,6 +36,7 @@ enum key_being_used_for { VERIFYING_KEY_SIGNATURE, VERIFYING_KEY_SELF_SIGNATURE, VERIFYING_UNSPECIFIED_SIGNATURE, + VERIFYING_BPF_SIGNATURE, NR__KEY_BEING_USED_FOR }; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 918cf25cd3c6..7427b79d6f3d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -193,14 +193,15 @@ static inline bool virtio_has_feature(const struct virtio_device 
*vdev, } static inline void virtio_get_features(struct virtio_device *vdev, - u64 *features) + u64 *features_out) { if (vdev->config->get_extended_features) { - vdev->config->get_extended_features(vdev, features); + vdev->config->get_extended_features(vdev, features_out); return; } - virtio_features_from_u64(features, vdev->config->get_features(vdev)); + virtio_features_from_u64(features_out, + vdev->config->get_features(vdev)); } /** @@ -326,13 +327,11 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) static inline bool virtio_get_shm_region(struct virtio_device *vdev, - struct virtio_shm_region *region, u8 id) + struct virtio_shm_region *region_out, u8 id) { - if (!region->len) - return false; if (!vdev->config->get_shm_region) return false; - return vdev->config->get_shm_region(vdev, region, id); + return vdev->config->get_shm_region(vdev, region_out, id); } static inline bool virtio_is_little_endian(struct virtio_device *vdev) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index fdc9aeb74a44..2759dac6be44 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -220,22 +220,6 @@ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); /* - * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values - * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() - * needs to be called. - */ -#ifndef ARCH_PAGE_TABLE_SYNC_MASK -#define ARCH_PAGE_TABLE_SYNC_MASK 0 -#endif - -/* - * There is no default implementation for arch_sync_kernel_mappings(). It is - * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK - * is 0. - */ -void arch_sync_kernel_mappings(unsigned long start, unsigned long end); - -/* * Lowlevel-APIs (not for driver use!) */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 45d5dd470ff6..dabc351cc127 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,7 +410,7 @@ enum wq_flags { __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ - __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, + __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI | WQ_PERCPU, }; enum wq_consts { @@ -434,10 +434,10 @@ enum wq_consts { * short queue flush time. Don't queue works which can run for too * long. * - * system_highpri_wq is similar to system_wq but for work items which + * system_highpri_wq is similar to system_percpu_wq but for work items which * require WQ_HIGHPRI. * - * system_long_wq is similar to system_wq but may host long running + * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * * system_dfl_wq is unbound workqueue. Workers are not bound to @@ -445,13 +445,13 @@ enum wq_consts { * executed immediately as long as max_active limit is not reached and * resources are available. * - * system_freezable_wq is equivalent to system_wq except that it's + * system_freezable_wq is equivalent to system_percpu_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. - * system_power_efficient_wq is identical to system_wq if + * system_power_efficient_wq is identical to system_percpu_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. 
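With schedule_work() and friends now routed to system_percpu_wq above, a minimal work-item sketch; the handler and trigger function are invented:

#include <linux/workqueue.h>
#include <linux/printk.h>

static void example_handler(struct work_struct *work)
{
	pr_info("example work item ran\n");
}

static DECLARE_WORK(example_work, example_handler);

static void example_kick(void)
{
	/* now equivalent to queue_work(system_percpu_wq, &example_work) */
	schedule_work(&example_work);
}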
* * system_bh[_highpri]_wq are convenience interface to softirq. BH work items @@ -502,7 +502,7 @@ void workqueue_softirq_dead(unsigned int cpu); * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * - * For detailed information on %WQ_* flags, please refer to + * For detailed information on %WQ_\* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: @@ -570,7 +570,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) @@ -708,7 +708,7 @@ static inline bool mod_delayed_work(struct workqueue_struct *wq, */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, system_wq, work); + return queue_work_on(cpu, system_percpu_wq, work); } /** @@ -727,7 +727,7 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) */ static inline bool schedule_work(struct work_struct *work) { - return queue_work(system_wq, work); + return queue_work(system_percpu_wq, work); } /** @@ -770,21 +770,21 @@ extern void __warn_flushing_systemwide_wq(void) #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ - __flush_workqueue(system_wq); \ + __flush_workqueue(system_percpu_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ - if ((__builtin_constant_p(_wq == system_wq) && \ - _wq == system_wq) || \ + if ((__builtin_constant_p(_wq == system_percpu_wq) && \ + _wq == system_percpu_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ - (__builtin_constant_p(_wq == system_unbound_wq) && \ - _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_dfl_wq) && \ + _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ @@ -807,7 +807,7 @@ extern void __warn_flushing_systemwide_wq(void) static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, system_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay); } /** @@ -821,7 +821,7 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_delayed_work(system_percpu_wq, dwork, delay); } #ifndef CONFIG_SMP diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a46..15a4bc4ab819 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } +void inode_switch_wbs_work_fn(struct work_struct *work); + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio)