diff options
Diffstat (limited to 'include/linux')
162 files changed, 5566 insertions, 1833 deletions
diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..fbf0c3a65f59 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -8,6 +8,7 @@ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H +#include <linux/cleanup.h> #include <linux/errno.h> #include <linux/ioport.h> /* for struct resource */ #include <linux/resource_ext.h> @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); +static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance) +{ + struct acpi_table_header *table; + int status = acpi_get_table(signature, instance, &table); + + if (ACPI_FAILURE(status)) + return ERR_PTR(-ENOENT); + return table; +} +DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T)) + int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, @@ -755,7 +767,6 @@ int acpi_reconfig_notifier_unregister(struct notifier_block *nb); int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); int acpi_gtdt_map_ppi(int type); bool acpi_gtdt_c3stop(int type); -int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); #endif #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER @@ -1146,12 +1157,7 @@ struct acpi_s2idle_dev_ops { #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); -int acpi_get_lps0_constraint(struct acpi_device *adev); #else /* CONFIG_SUSPEND && CONFIG_X86 */ -static inline int acpi_get_lps0_constraint(struct device *dev) -{ - return ACPI_STATE_UNKNOWN; -} static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) { return -ENODEV; @@ -1349,9 +1355,6 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); -struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child); - struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); @@ -1451,13 +1454,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, } static inline struct fwnode_handle * -acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child) -{ - return NULL; -} - -static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { @@ -1509,12 +1505,19 @@ static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console) #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res); +const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index); #else static inline int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) { return -EINVAL; } +static inline const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index) +{ + return NULL; +} #endif #ifdef CONFIG_ACPI_LPIT @@ -1541,6 +1544,9 @@ int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); +int find_acpi_cache_level_from_id(u32 cache_id); +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1562,6 +1568,17 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } +static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, + cpumask_t *cpus) { } +static inline int find_acpi_cache_level_from_id(u32 cache_id) +{ + return -ENOENT; +} +static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, + cpumask_t *cpus) +{ + return -ENOENT; +} #endif void acpi_arch_init(void); diff --git a/include/linux/annotate.h b/include/linux/annotate.h new file mode 100644 index 000000000000..7c10d34d198c --- /dev/null +++ b/include/linux/annotate.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ANNOTATE_H +#define _LINUX_ANNOTATE_H + +#include <linux/objtool_types.h> + +#ifdef CONFIG_OBJTOOL + +#ifndef __ASSEMBLY__ + +#define __ASM_ANNOTATE(section, label, type) \ + ".pushsection " section ",\"M\", @progbits, 8\n\t" \ + ".long " __stringify(label) " - .\n\t" \ + ".long " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define ASM_ANNOTATE_LABEL(label, type) \ + __ASM_ANNOTATE(".discard.annotate_insn", label, type) + +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + ASM_ANNOTATE_LABEL(911b, type) + +#define ASM_ANNOTATE_DATA(type) \ + "912:\n\t" \ + __ASM_ANNOTATE(".discard.annotate_data", 912b, type) + +#else /* __ASSEMBLY__ */ + +.macro __ANNOTATE section, type +.Lhere_\@: + .pushsection \section, "M", @progbits, 8 + .long .Lhere_\@ - . + .long \type + .popsection +.endm + +.macro ANNOTATE type + __ANNOTATE ".discard.annotate_insn", \type +.endm + +.macro ANNOTATE_DATA type + __ANNOTATE ".discard.annotate_data", \type +.endm + +#endif /* __ASSEMBLY__ */ + +#else /* !CONFIG_OBJTOOL */ +#ifndef __ASSEMBLY__ +#define ASM_ANNOTATE_LABEL(label, type) "" +#define ASM_ANNOTATE(type) +#define ASM_ANNOTATE_DATA(type) +#else /* __ASSEMBLY__ */ +.macro ANNOTATE type +.endm +.macro ANNOTATE_DATA type +.endm +#endif /* __ASSEMBLY__ */ +#endif /* !CONFIG_OBJTOOL */ + +#ifndef __ASSEMBLY__ + +/* + * Annotate away the various 'relocation to !ENDBR` complaints; knowing that + * these relocations will never be used for indirect calls. + */ +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +#define ANNOTATE_NOENDBR_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOENDBR)) + +/* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. + */ +#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) +/* + * See linux/instrumentation.h + */ +#define ANNOTATE_INSTR_BEGIN(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_END) +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) +/* + * Use objtool to validate the entry requirement that all code paths do + * VALIDATE_UNRET_END before RET. + * + * NOTE: The macro must be used at the beginning of a global symbol, otherwise + * it will be ignored. + */ +#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) +/* + * This should be used to refer to an instruction that is considered + * terminating, like a noreturn CALL or UD2 when we know they are not -- eg + * WARN using UD2. + */ +#define ANNOTATE_REACHABLE(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_REACHABLE) +/* + * This should not be used; it annotates away CFI violations. There are a few + * valid use cases like kexec handover to the next kernel image, and there is + * no security concern there. + * + * There are also a few real issues annotated away, like EFI because we can't + * control the EFI code. + */ +#define ANNOTATE_NOCFI_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOCFI)) + +/* + * Annotate a special section entry. This emables livepatch module generation + * to find and extract individual special section entries as needed. + */ +#define ANNOTATE_DATA_SPECIAL ASM_ANNOTATE_DATA(ANNOTYPE_DATA_SPECIAL) + +#else /* __ASSEMBLY__ */ +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE +/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ +/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ +#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL +#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE +#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI +#define ANNOTATE_DATA_SPECIAL ANNOTATE_DATA type=ANNOTYPE_DATA_SPECIAL +#endif /* __ASSEMBLY__ */ + +#endif /* _LINUX_ANNOTATE_H */ diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d72d6e5aa200..0c2a8b846c20 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -89,6 +89,21 @@ void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); void freq_inv_set_max_ratio(int cpu, u64 max_rate); -#endif + +/* + * Architectures like ARM64 don't have reliable architectural way to get SMT + * information and depend on the firmware (ACPI/OF) report. Non-SMT core won't + * initialize thread_id so we can use this to detect the SMT implementation. + */ +static inline bool topology_core_has_smt(int cpu) +{ + return cpu_topology[cpu].thread_id != -1; +} + +#else + +static inline bool topology_core_has_smt(int cpu) { return false; } + +#endif /* CONFIG_GENERIC_ARCH_TOPOLOGY */ #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h new file mode 100644 index 000000000000..7f00c5285a32 --- /dev/null +++ b/include/linux/arm_mpam.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __LINUX_ARM_MPAM_H +#define __LINUX_ARM_MPAM_H + +#include <linux/acpi.h> +#include <linux/types.h> + +struct mpam_msc; + +enum mpam_msc_iface { + MPAM_IFACE_MMIO, /* a real MPAM MSC */ + MPAM_IFACE_PCC, /* a fake MPAM MSC */ +}; + +enum mpam_class_types { + MPAM_CLASS_CACHE, /* Caches, e.g. L2, L3 */ + MPAM_CLASS_MEMORY, /* Main memory */ + MPAM_CLASS_UNKNOWN, /* Everything else, e.g. SMMU */ +}; + +#define MPAM_CLASS_ID_DEFAULT 255 + +#ifdef CONFIG_ACPI_MPAM +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc); + +int acpi_mpam_count_msc(void); +#else +static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + return -EINVAL; +} + +static inline int acpi_mpam_count_msc(void) { return -EINVAL; } +#endif + +#ifdef CONFIG_ARM64_MPAM_DRIVER +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id); +#else +static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + return -EINVAL; +} +#endif + +/** + * mpam_register_requestor() - Register a requestor with the MPAM driver + * @partid_max: The maximum PARTID value the requestor can generate. + * @pmg_max: The maximum PMG value the requestor can generate. + * + * Registers a requestor with the MPAM driver to ensure the chosen system-wide + * minimum PARTID and PMG values will allow the requestors features to be used. + * + * Returns an error if the registration is too late, and a larger PARTID/PMG + * value has been advertised to user-space. In this case the requestor should + * not use its MPAM features. Returns 0 on success. + */ +int mpam_register_requestor(u16 partid_max, u8 pmg_max); + +#endif /* __LINUX_ARM_MPAM_H */ diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index 9409a6ddf3e0..37ab6314a9f7 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -1276,7 +1276,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg(v, old, new); } @@ -1298,7 +1298,7 @@ static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_acquire(v, old, new); } @@ -1321,7 +1321,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_release(v, old, new); } @@ -1343,7 +1343,7 @@ static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_relaxed(v, old, new); } @@ -2854,7 +2854,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg(v, old, new); } @@ -2876,7 +2876,7 @@ static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_acquire(v, old, new); } @@ -2899,7 +2899,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_release(v, old, new); } @@ -2921,7 +2921,7 @@ static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_relaxed(v, old, new); } @@ -4432,7 +4432,7 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg(v, old, new); } @@ -4454,7 +4454,7 @@ static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_acquire(v, old, new); } @@ -4477,7 +4477,7 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_release(v, old, new); } @@ -4499,7 +4499,7 @@ static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_relaxed(v, old, new); } @@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 8829b337928e9508259079d32581775ececd415b +// f618ac667f868941a84ce0ab2242f1786e049ed4 diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c5c9d89c73ed..0217c1073735 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -63,6 +63,8 @@ enum wb_reason { struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; + unsigned long progress_stamp; /* The jiffies when slow progress is detected */ + unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */ }; #define __WB_COMPLETION_INIT(_waitq) \ @@ -168,7 +170,9 @@ struct backing_dev_info { u64 id; struct rb_node rb_node; /* keyed by ->id */ struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ + /* max readahead in PAGE_SIZE units */ + unsigned long __data_racy ra_pages; + unsigned long io_pages; /* max allowed IO size */ struct kref refcnt; /* Reference counter for the structure */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3e64f14739dd..0c8342747cab 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -277,10 +277,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) rcu_read_lock(); /* - * Paired with store_release in inode_switch_wbs_work_fn() and + * Paired with a release fence in inode_do_switch_wbs() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH; + smp_rmb(); if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 851254f36eb3..21e4652dcfd2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,7 +13,8 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ - BIP_P2P_DMA = 1 << 8, /* using P2P address */ + + BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; struct bio_integrity_payload { @@ -140,4 +141,8 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + +void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); +void bio_integrity_free_buf(struct bio_integrity_payload *bip); + #endif /* _LINUX_BIO_INTEGRITY_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 16c1c85613b7..ad2d57908c1c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,6 +324,8 @@ extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align); +u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, + u8 gaps_bit); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 595217b7a6e7..b0395e4ccf90 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -45,6 +45,7 @@ struct device; * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 + * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, @@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const unsigned long *src1, } static __always_inline +unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) { + *dst = *src1 | *src2; + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); + } else { + return __bitmap_weighted_or(dst, src1, src2, nbits); + } +} + +static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index b659373788f6..a6b84206eb94 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -8,6 +8,11 @@ struct request; +/* + * Maximum contiguous integrity buffer allocation. + */ +#define BLK_INTEGRITY_MAX_SIZE SZ_2M + enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, @@ -28,14 +33,6 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); -} - int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); @@ -124,12 +121,6 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return false; -} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 51829958d872..cb88fc791fbd 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -16,13 +16,13 @@ struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; u32 len; + struct pci_p2pdma_map_state p2pdma; /* Status code. Only valid when blk_rq_dma_map_iter_* returned false */ blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ struct blk_map_iter iter; - struct pci_p2pdma_map_state p2pdma; }; bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, @@ -43,36 +43,34 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_dma_unmap - try to DMA unmap a request + * blk_rq_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap - * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR + * @map: peer-to-peer mapping type * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. */ -static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len, bool is_p2p) +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len, + enum pci_p2pdma_map_type map) { - if (is_p2p) + if (map == PCI_P2PDMA_MAP_BUS_ADDR) return true; if (dma_use_iova(state)) { + unsigned int attrs = 0; + + if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req), - 0); + attrs); return true; } return !dma_need_unmap(dma_dev); } - -static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - req->cmd_flags & REQ_P2PDMA); -} - #endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b25d12545f46..eb7254b3dddd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -152,6 +152,14 @@ struct request { unsigned short nr_phys_segments; unsigned short nr_integrity_segments; + /* + * The lowest set bit for address gaps between physical segments. This + * provides information necessary for dma optimization opprotunities, + * like for testing if the segments can be coalesced against the + * device's iommu granule. + */ + unsigned char phys_gap_bit; + #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_crypto_keyslot *crypt_keyslot; @@ -208,6 +216,14 @@ struct request { void *end_io_data; }; +/* + * Returns a mask with all bits starting at req->phys_gap_bit set to 1. + */ +static inline unsigned long req_phys_gap_mask(const struct request *req) +{ + return ~(((1 << req->phys_gap_bit) >> 1) - 1); +} + static inline enum req_op req_op(const struct request *req) { return req->cmd_flags & REQ_OP_MASK; @@ -999,8 +1015,20 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) return rq + 1; } +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) +{ + struct blk_mq_hw_ctx *hctx; + + rcu_read_lock(); + hctx = rcu_dereference(q->queue_hw_ctx)[id]; + rcu_read_unlock(); + + return hctx; +} + #define queue_for_each_hw_ctx(q, hctx, i) \ - xa_for_each(&(q)->hctx_table, (i), (hctx)) + for ((i) = 0; (i) < (q)->nr_hw_queues && \ + ({ hctx = queue_hctx((q), i); 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 44c30183ecc3..cbbcb9051ec3 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -218,6 +218,18 @@ struct bio { enum rw_hint bi_write_hint; u8 bi_write_stream; blk_status_t bi_status; + + /* + * The bvec gap bit indicates the lowest set bit in any address offset + * between all bi_io_vecs. This field is initialized only after the bio + * is split to the hardware limits (see bio_split_io_at()). The value + * may be used to consider DMA optimization when performing that + * mapping. The value is compared to a power of two mask where the + * result depends on any bit set within the mask, so saving the lowest + * bit is sufficient to know if any segment gap collides with the mask. + */ + u8 bi_bvec_gap_bit; + atomic_t __bi_remaining; struct bvec_iter bi_iter; @@ -381,7 +393,6 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ - __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -414,7 +425,6 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) -#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 70b671a9a7f7..72e34acd439c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; +struct blk_report_zones_args; struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; @@ -172,6 +173,7 @@ struct gendisk { #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 +#define GD_ZONE_APPEND_USED 7 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ @@ -195,7 +197,7 @@ struct gendisk { unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; - unsigned long __rcu *conv_zones_bitmap; + u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; @@ -378,7 +380,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; - unsigned int min_segment_size; + unsigned int max_fast_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; @@ -432,9 +434,17 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args); + +int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, + struct blk_zone *zone); + #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); +int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk); @@ -485,7 +495,7 @@ struct request_queue { */ unsigned long queue_flags; - unsigned int rq_timeout; + unsigned int __data_racy rq_timeout; unsigned int queue_depth; @@ -493,7 +503,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct xarray hctx_table; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; @@ -921,12 +931,20 @@ static inline unsigned int bdev_zone_capacity(struct block_device *bdev, { return disk_zone_capacity(bdev->bd_disk, pos); } + +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector); + #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } +static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + return false; +} + static inline bool bio_needs_zone_write_plugging(struct bio *bio) { return false; @@ -1504,6 +1522,12 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return q->limits.chunk_sectors; } +static inline sector_t bdev_zone_start(struct block_device *bdev, + sector_t sector) +{ + return sector & ~(bdev_zone_sectors(bdev) - 1); +} + static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, sector_t sector) { @@ -1529,33 +1553,6 @@ static inline bool bdev_is_zone_aligned(struct block_device *bdev, return bdev_is_zone_start(bdev, sector); } -/** - * bdev_zone_is_seq - check if a sector belongs to a sequential write zone - * @bdev: block device to check - * @sector: sector number - * - * Check if @sector on @bdev is contained in a sequential write required zone. - */ -static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) -{ - bool is_seq = false; - -#if IS_ENABLED(CONFIG_BLK_DEV_ZONED) - if (bdev_is_zoned(bdev)) { - struct gendisk *disk = bdev->bd_disk; - unsigned long *bitmap; - - rcu_read_lock(); - bitmap = rcu_dereference(disk->conv_zones_bitmap); - is_seq = !bitmap || - !test_bit(disk_zone_no(disk, sector), bitmap); - rcu_read_unlock(); - } -#endif - - return is_seq; -} - int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); @@ -1662,7 +1659,8 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 122c62e561fc..05c8754456aa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -14,11 +14,12 @@ #include <linux/sysfs.h> struct blk_trace { + int version; int trace_state; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; - u16 act_mask; + u64 act_mask; u64 start_lba; u64 end_lba; u32 pid; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d808253f2e94..6498be4c44f8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -663,6 +663,16 @@ int map_check_no_btf(const struct bpf_map *map, bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); +static inline bool bpf_map_has_internal_structs(struct bpf_map *map) +{ + return btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK); +} + +void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); + +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, + struct bpf_dynptr *ptr__uninit); + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of @@ -785,12 +795,15 @@ enum bpf_type_flag { /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to file */ + DYNPTR_TYPE_FILE = BIT(20 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META | DYNPTR_TYPE_FILE) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -988,6 +1001,7 @@ enum bpf_reg_type { PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_INSN, /* reg points to a bpf program instruction */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, @@ -1250,6 +1264,18 @@ typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP +static inline bool bpf_trampoline_use_jmp(u64 flags) +{ + return flags & BPF_TRAMP_F_CALL_ORIG && !(flags & BPF_TRAMP_F_SKIP_FRAME); +} +#else +static inline bool bpf_trampoline_use_jmp(u64 flags) +{ + return false; +} +#endif + struct bpf_ksym { unsigned long start; unsigned long end; @@ -1378,21 +1404,23 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_XDP, /* Points to skb_metadata_end()-skb_metadata_len() */ BPF_DYNPTR_TYPE_SKB_META, + /* Underlying data is a file */ + BPF_DYNPTR_TYPE_FILE, }; -int bpf_dynptr_check_size(u32 size); -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); +int bpf_dynptr_check_size(u64 size); +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, - void *src, u32 len, u64 flags); -void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk); +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, + void *src, u64 len, u64 flags); +void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk); -static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { - u32 size = __bpf_dynptr_size(ptr); + u64 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; @@ -1616,6 +1644,7 @@ struct bpf_prog_aux { u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; + u32 subprog_start; struct btf *attach_btf; struct bpf_ctx_arg_aux *ctx_arg_info; void __percpu *priv_stack_ptr; @@ -1905,12 +1934,14 @@ struct btf_member; * reason, if this callback is not defined, the check is skipped as * the struct_ops map will have final verification performed in * @reg. - * @type: BTF type. - * @value_type: Value type. + * @cfi_stubs: Pointer to a structure of stub functions for CFI. These stubs + * provide the correct Control Flow Integrity hashes for the + * trampolines generated by BPF struct_ops. + * @owner: The module that owns this struct_ops. Used for module reference + * counting to ensure the module providing the struct_ops cannot be + * unloaded while in use. * @name: The name of the struct bpf_struct_ops object. * @func_models: Func models - * @type_id: BTF type id. - * @value_id: BTF value id. */ struct bpf_struct_ops { const struct bpf_verifier_ops *verifier_ops; @@ -2099,6 +2130,12 @@ struct bpf_array { }; }; +/* + * The bpf_array_get_next_key() function may be used for all array-like + * maps, i.e., maps with u32 keys with range [0 ,..., max_entries) + */ +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key); + #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 @@ -2374,6 +2411,9 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, bool bpf_jit_bypass_spec_v1(void); bool bpf_jit_bypass_spec_v4(void); +#define bpf_rcu_lock_held() \ + (rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held()) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; @@ -3670,12 +3710,14 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { + BPF_MOD_NOP, BPF_MOD_CALL, BPF_MOD_JUMP, }; -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *addr1, void *addr2); +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr); void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old); @@ -3772,4 +3814,30 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog); +int bpf_insn_array_ready(struct bpf_map *map); +void bpf_insn_array_release(struct bpf_map *map); +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len); +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len); + +#ifdef CONFIG_BPF_SYSCALL +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image); +#else +static inline void +bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ +} +#endif + +static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) +{ + if (flags & ~allowed_flags) + return -EINVAL; + + if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; + + return 0; +} + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index ab7244d8108f..66432248cd81 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -18,9 +18,6 @@ #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 -#define bpf_rcu_lock_held() \ - (rcu_read_lock_held() || rcu_read_lock_trace_held() || \ - rcu_read_lock_bh_held()) struct bpf_local_storage_map_bucket { struct hlist_head list; raw_spinlock_t lock; @@ -56,9 +53,7 @@ struct bpf_local_storage_map { u32 bucket_log; u16 elem_size; u16 cache_idx; - struct bpf_mem_alloc selem_ma; - struct bpf_mem_alloc storage_ma; - bool bpf_ma; + bool use_kmalloc_nolock; }; struct bpf_local_storage_data { @@ -100,6 +95,7 @@ struct bpf_local_storage { */ struct rcu_head rcu; raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + bool use_kmalloc_nolock; }; /* U16_MAX is much more than enough for sk local storage @@ -133,7 +129,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, - bool bpf_ma); + bool use_kmalloc_nolock); void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, @@ -187,10 +183,9 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem, bool swap_uptrs, gfp_t gfp_flags); + bool swap_uptrs, gfp_t gfp_flags); void bpf_selem_free(struct bpf_local_storage_elem *selem, - struct bpf_local_storage_map *smap, bool reuse_now); int diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa78f49d4a9a..b13de31e163f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -133,6 +133,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c497e839526..130bcbd66f60 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -416,7 +416,7 @@ struct bpf_verifier_state { u32 active_irq_id; u32 active_lock_id; void *active_lock_ptr; - bool active_rcu_lock; + u32 active_rcu_locks; bool speculative; bool in_sleepable; @@ -509,6 +509,15 @@ struct bpf_map_ptr_state { #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) +/* + * An array of BPF instructions. + * Primary usage: return value of bpf_insn_successors. + */ +struct bpf_iarray { + int cnt; + u32 items[]; +}; + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ @@ -540,6 +549,7 @@ struct bpf_insn_aux_data { /* remember the offset of node field within type to rewrite */ u64 insert_off; }; + struct bpf_iarray *jt; /* jump table for gotox or bpf_tailcall call instruction */ struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ @@ -548,7 +558,7 @@ struct bpf_insn_aux_data { bool nospec_result; /* result is unsafe under speculation, nospec must follow */ bool zext_dst; /* this insn zero extends dst reg */ bool needs_zext; /* alu op needs to clear upper bits */ - bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ + bool non_sleepable; /* helper/kfunc may be called from non-sleepable context */ bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ @@ -642,6 +652,7 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ + u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) @@ -659,9 +670,9 @@ struct bpf_subprog_info { bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; bool might_sleep: 1; + u8 arg_cnt:3; enum priv_stack_mode priv_stack_mode; - u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; @@ -745,8 +756,10 @@ struct bpf_verifier_env { struct list_head free_list; /* list of struct bpf_verifier_state_list */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ + struct bpf_map *insn_array_maps[MAX_USED_MAPS]; /* array of INSN_ARRAY map's to be relocated */ u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ + u32 insn_array_map_cnt; /* number of used maps of type BPF_MAP_TYPE_INSN_ARRAY */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ int exception_callback_subprog; @@ -828,6 +841,8 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; + struct bpf_iarray *succ; + struct bpf_iarray *gotox_tmp_buf; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) @@ -1038,6 +1053,13 @@ static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_ return !(off % BPF_REG_SIZE); } +static inline bool insn_is_gotox(struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_JA && + BPF_SRC(insn->code) == BPF_X; +} + const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); @@ -1050,7 +1072,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); int bpf_jmp_offset(struct bpf_insn *insn); -int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); diff --git a/include/linux/bug.h b/include/linux/bug.h index a9948a9f1093..17a4933c611b 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -42,6 +42,7 @@ void bug_get_file_line(struct bug_entry *bug, const char **file, struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); +enum bug_trap_type report_bug_entry(struct bug_entry *bug, struct pt_regs *regs); /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); @@ -62,6 +63,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, } struct bug_entry; + +static inline enum bug_trap_type +report_bug_entry(struct bug_entry *bug, struct pt_regs *regs) +{ + return BUG_TRAP_TYPE_BUG; +} + static inline void bug_get_file_line(struct bug_entry *bug, const char **file, unsigned int *line) { diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 014a88c41073..831c1b4b626c 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -18,4 +18,29 @@ void init_vmlinux_build_id(void); static inline void init_vmlinux_build_id(void) { } #endif +struct freader { + void *buf; + u32 buf_sz; + int err; + union { + struct { + struct file *file; + struct folio *folio; + void *addr; + loff_t folio_off; + bool may_fault; + }; + struct { + const char *data; + u64 data_sz; + }; + }; +}; + +void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, + struct file *file, bool may_fault); +void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz); +const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz); +void freader_cleanup(struct freader *r); + #endif diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index b3705e8bbe2b..55a44199de87 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) } } +static inline void le64_to_cpu_array(u64 *buf, unsigned int words) +{ + while (words--) { + __le64_to_cpus(buf); + buf++; + } +} + +static inline void cpu_to_le64_array(u64 *buf, unsigned int words) +{ + while (words--) { + __cpu_to_le64s(buf); + buf++; + } +} + static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words) { size_t i; diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 7fcec025c5e0..559353ad64ac 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -74,7 +74,7 @@ enum cc_attr { CC_ATTR_GUEST_UNROLL_STRING_IO, /** - * @CC_ATTR_SEV_SNP: Guest SNP is active. + * @CC_ATTR_GUEST_SEV_SNP: Guest SNP is active. * * The platform/OS is running as a guest/virtual machine and actively * using AMD SEV-SNP features. diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 733e7f93db66..63e0e2aa1ce9 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -306,8 +306,7 @@ struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern void ceph_reset_client_addr(struct ceph_client *client); -extern int __ceph_open_session(struct ceph_client *client, - unsigned long started); +extern int __ceph_open_session(struct ceph_client *client); extern int ceph_open_session(struct ceph_client *client); int ceph_wait_for_latest_osdmap(struct ceph_client *client, unsigned long timeout); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6ed477338b16..bc892e3b37ee 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -137,9 +137,10 @@ extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); -void cgroup_exit(struct task_struct *p); -void cgroup_release(struct task_struct *p); -void cgroup_free(struct task_struct *p); +void cgroup_task_exit(struct task_struct *p); +void cgroup_task_dead(struct task_struct *p); +void cgroup_task_release(struct task_struct *p); +void cgroup_task_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); @@ -680,9 +681,10 @@ static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} -static inline void cgroup_exit(struct task_struct *p) {} -static inline void cgroup_release(struct task_struct *p) {} -static inline void cgroup_free(struct task_struct *p) {} +static inline void cgroup_task_exit(struct task_struct *p) {} +static inline void cgroup_task_dead(struct task_struct *p) {} +static inline void cgroup_task_release(struct task_struct *p) {} +static inline void cgroup_task_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..0b55a8f6c59e 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -208,7 +208,7 @@ */ #define DEFINE_FREE(_name, _type, _free) \ - static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } + static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } #define __free(_name) __cleanup(__free_##_name) @@ -220,7 +220,7 @@ __val; \ }) -static inline __must_check +static __always_inline __must_check const volatile void * __must_check_fn(const volatile void *val) { return val; } @@ -261,6 +261,10 @@ const volatile void * __must_check_fn(const volatile void *val) * CLASS(name, var)(args...): * declare the variable @var as an instance of the named class * + * CLASS_INIT(name, var, init_expr): + * declare the variable @var as an instance of the named class with + * custom initialization expression. + * * Ex. * * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd) @@ -274,31 +278,35 @@ const volatile void * __must_check_fn(const volatile void *val) #define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \ typedef _type class_##_name##_t; \ -static inline void class_##_name##_destructor(_type *p) \ +static __always_inline void class_##_name##_destructor(_type *p) \ { _type _T = *p; _exit; } \ -static inline _type class_##_name##_constructor(_init_args) \ +static __always_inline _type class_##_name##_constructor(_init_args) \ { _type t = _init; return t; } #define EXTEND_CLASS(_name, ext, _init, _init_args...) \ typedef class_##_name##_t class_##_name##ext##_t; \ -static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\ +static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ { class_##_name##_destructor(p); } \ -static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ +static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ { class_##_name##_t t = _init; return t; } #define CLASS(_name, var) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor -#define scoped_class(_name, var, args) \ - for (CLASS(_name, var)(args); \ - __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \ - ({ goto _label; })) \ - if (0) { \ -_label: \ - break; \ +#define CLASS_INIT(_name, _var, _init_expr) \ + class_##_name##_t _var __cleanup(class_##_name##_destructor) = (_init_expr) + +#define __scoped_class(_name, var, _label, args...) \ + for (CLASS(_name, var)(args); ; ({ goto _label; })) \ + if (0) { \ +_label: \ + break; \ } else +#define scoped_class(_name, var, args...) \ + __scoped_class(_name, var, __UNIQUE_ID(label), args) + /* * DEFINE_GUARD(name, type, lock, unlock): * trivial wrapper around DEFINE_CLASS() above specifically @@ -340,6 +348,11 @@ _label: \ #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond +#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return (void *)1; } + #define __GUARD_IS_ERR(_ptr) \ ({ \ unsigned long _rc = (__force unsigned long)(_ptr); \ @@ -347,7 +360,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond }) #define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ - static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ { \ void *_ptr = (void *)(__force unsigned long)*(_exp); \ if (IS_ERR(_ptr)) { \ @@ -355,7 +368,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond } \ return _ptr; \ } \ - static inline int class_##_name##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \ { \ long _rc = (__force unsigned long)*(_exp); \ if (!_rc) { \ @@ -384,9 +397,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond EXTEND_CLASS(_name, _ext, \ ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ class_##_name##_t _T) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } /* @@ -466,7 +479,7 @@ typedef struct { \ __VA_ARGS__; \ } class_##_name##_t; \ \ -static inline void class_##_name##_destructor(class_##_name##_t *_T) \ +static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ { \ if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ } \ @@ -474,7 +487,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(_type *l) \ +static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \ { \ class_##_name##_t _t = { .lock = l }, *_T = &_t; \ _lock; \ @@ -482,7 +495,7 @@ static inline class_##_name##_t class_##_name##_constructor(_type *l) \ } #define __DEFINE_LOCK_GUARD_0(_name, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(void) \ +static __always_inline class_##_name##_t class_##_name##_constructor(void) \ { \ class_##_name##_t _t = { .lock = (void*)1 }, \ *_T __maybe_unused = &_t; \ @@ -508,9 +521,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ _t; }), \ typeof_member(class_##_name##_t, lock) l) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } #define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5b45ea7dff3e..ab181d87d71d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -163,7 +163,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __asm__ ("" : "=r" (var) : "0" (var)) #endif -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) +/* Format: __UNIQUE_ID_<name>_<__COUNTER__> */ +#define __UNIQUE_ID(name) \ + __PASTE(__UNIQUE_ID_, \ + __PASTE(name, \ + __PASTE(_, __COUNTER__))) /** * data_race - mark an expression as containing intentional data races @@ -283,7 +287,7 @@ static inline void *offset_to_ptr(const int *off) */ #define ___ADDRESSABLE(sym, __attrs) \ static void * __used __attrs \ - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + __UNIQUE_ID(__PASTE(addressable_, sym)) = (void *)(uintptr_t)&sym; #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 0a1b9598940d..3eac51d68426 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -393,6 +393,21 @@ struct ftrace_likely_data { #define __counted_by_be(member) __counted_by(member) #endif +/* + * This designates the minimum number of elements a passed array parameter must + * have. For example: + * + * void some_function(u8 param[at_least 7]); + * + * If a caller passes an array with fewer than 7 elements, the compiler will + * emit a warning. + */ +#ifndef __CHECKER__ +#define at_least static +#else +#define at_least +#endif + /* Do not trap wrapping arithmetic within an annotated function. */ #ifdef CONFIG_UBSAN_INTEGER_WRAP # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) diff --git a/include/linux/console.h b/include/linux/console.h index 031a58dc2b91..fc9f5c5c1b04 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -19,6 +19,7 @@ #include <linux/irq_work.h> #include <linux/rculist.h> #include <linux/rcuwait.h> +#include <linux/smp.h> #include <linux/types.h> #include <linux/vesa.h> @@ -185,6 +186,8 @@ static inline void con_debug_leave(void) { } * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. + * @CON_NBCON_ATOMIC_UNSAFE: The write_atomic() callback is not safe and is + * therefore only used by nbcon_atomic_flush_unsafe(). */ enum cons_flags { CON_PRINTBUFFER = BIT(0), @@ -196,6 +199,7 @@ enum cons_flags { CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), + CON_NBCON_ATOMIC_UNSAFE = BIT(9), }; /** @@ -602,16 +606,80 @@ static inline bool console_is_registered(const struct console *con) extern void nbcon_cpu_emergency_enter(void); extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); +extern void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); +extern bool nbcon_allow_unsafe_takeover(void); +extern bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt); +extern void nbcon_kdb_release(struct nbcon_write_context *wctxt); + +/* + * Check if the given console is currently capable and allowed to print + * records. Note that this function does not consider the current context, + * which can also play a role in deciding if @con can be used to print + * records. + */ +static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) +{ + if (!(flags & CON_ENABLED)) + return false; + + if ((flags & CON_SUSPENDED)) + return false; + + if (flags & CON_NBCON) { + if (use_atomic) { + /* The write_atomic() callback is optional. */ + if (!con->write_atomic) + return false; + + /* + * An unsafe write_atomic() callback is only usable + * when unsafe takeovers are allowed. + */ + if ((flags & CON_NBCON_ATOMIC_UNSAFE) && !nbcon_allow_unsafe_takeover()) + return false; + } + + /* + * For the !use_atomic case, @printk_kthreads_running is not + * checked because the write_thread() callback is also used + * via the legacy loop when the printer threads are not + * available. + */ + } else { + if (!con->write) + return false; + } + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) + return false; + + return true; +} + #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } +static inline void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) { } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } +static inline bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt) { return false; } +static inline void nbcon_kdb_release(struct nbcon_write_context *wctxt) { } +static inline bool console_is_usable(struct console *con, short flags, + bool use_atomic) { return false; } #endif extern int console_set_on_cmdline; diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 7b8433d5a8ef..0b81248aa03e 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -18,12 +18,6 @@ enum ctx_state { CT_STATE_MAX = 4, }; -/* Odd value for watching, else even. */ -#define CT_RCU_WATCHING CT_STATE_MAX - -#define CT_STATE_MASK (CT_STATE_MAX - 1) -#define CT_RCU_WATCHING_MASK (~CT_STATE_MASK) - struct context_tracking { #ifdef CONFIG_CONTEXT_TRACKING_USER /* @@ -44,9 +38,45 @@ struct context_tracking { #endif }; +/* + * We cram two different things within the same atomic variable: + * + * CT_RCU_WATCHING_START CT_STATE_START + * | | + * v v + * MSB [ RCU watching counter ][ context_state ] LSB + * ^ ^ + * | | + * CT_RCU_WATCHING_END CT_STATE_END + * + * Bits are used from the LSB upwards, so unused bits (if any) will always be in + * upper bits of the variable. + */ #ifdef CONFIG_CONTEXT_TRACKING +#define CT_SIZE (sizeof(((struct context_tracking *)0)->state) * BITS_PER_BYTE) + +#define CT_STATE_WIDTH bits_per(CT_STATE_MAX - 1) +#define CT_STATE_START 0 +#define CT_STATE_END (CT_STATE_START + CT_STATE_WIDTH - 1) + +#define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_STATE_WIDTH) +#define CT_RCU_WATCHING_WIDTH (IS_ENABLED(CONFIG_RCU_DYNTICKS_TORTURE) ? 2 : CT_RCU_WATCHING_MAX_WIDTH) +#define CT_RCU_WATCHING_START (CT_STATE_END + 1) +#define CT_RCU_WATCHING_END (CT_RCU_WATCHING_START + CT_RCU_WATCHING_WIDTH - 1) +#define CT_RCU_WATCHING BIT(CT_RCU_WATCHING_START) + +#define CT_STATE_MASK GENMASK(CT_STATE_END, CT_STATE_START) +#define CT_RCU_WATCHING_MASK GENMASK(CT_RCU_WATCHING_END, CT_RCU_WATCHING_START) + +#define CT_UNUSED_WIDTH (CT_RCU_WATCHING_MAX_WIDTH - CT_RCU_WATCHING_WIDTH) + +static_assert(CT_STATE_WIDTH + + CT_RCU_WATCHING_WIDTH + + CT_UNUSED_WIDTH == + CT_SIZE); + DECLARE_PER_CPU(struct context_tracking, context_tracking); -#endif +#endif /* CONFIG_CONTEXT_TRACKING */ #ifdef CONFIG_CONTEXT_TRACKING_USER static __always_inline int __ct_state(void) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a9ee4fe55dcf..4073690504a7 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -248,7 +248,8 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, u64 latency_limit_ns); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + u64 latency_limit_ns); extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, @@ -256,7 +257,8 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, u64 latency_limit_ns) {return -ENODEV; } static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) {return -ENODEV; } static inline void cpuidle_use_deepest_state(u64 latency_limit_ns) { diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff8f41ab7ce6..afedfd5bea07 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -126,6 +126,7 @@ extern struct cpumask __cpu_dying_mask; #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; +extern unsigned int __num_possible_cpus; extern cpumask_t cpus_booted_once_mask; @@ -729,6 +730,22 @@ void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, } /** + * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + * + * Return: The number of bits set in the resulting cpumask @dstp + */ +static __always_inline +unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), small_cpumask_bits); +} + +/** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input @@ -1005,6 +1022,7 @@ static __always_inline unsigned int cpumask_size(void) #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly +#define CPUMASK_VAR_NULL NULL bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); @@ -1051,6 +1069,7 @@ static __always_inline bool cpumask_available(cpumask_var_t mask) #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly +#define CPUMASK_VAR_NULL {} static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { @@ -1136,13 +1155,13 @@ void init_cpu_possible(const struct cpumask *src); #define __assign_cpu(cpu, mask, val) \ __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) -#define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); +void set_cpu_possible(unsigned int cpu, bool possible); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * @@ -1195,7 +1214,12 @@ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } -#define num_possible_cpus() cpumask_weight(cpu_possible_mask) + +static __always_inline unsigned int num_possible_cpus(void) +{ + return __num_possible_cpus; +} + #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2ddb256187b5..a98d3330385c 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -74,6 +74,7 @@ extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); +extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern bool cpuset_cpu_is_isolated(int cpu); @@ -195,10 +196,16 @@ static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } +static inline void cpuset_cpus_allowed_locked(struct task_struct *p, + struct cpumask *mask) +{ + cpumask_copy(mask, task_cpu_possible_mask(p)); +} + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { - cpumask_copy(mask, task_cpu_possible_mask(p)); + cpuset_cpus_allowed_locked(p, mask); } static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) diff --git a/include/linux/cred.h b/include/linux/cred.h index 89ae50ad2ace..343a140a6ba2 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -20,6 +20,8 @@ struct cred; struct inode; +extern struct task_struct init_task; + /* * COW Supplementary groups list */ @@ -156,6 +158,11 @@ extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); +static inline const struct cred *kernel_cred(void) +{ + /* shut up sparse */ + return rcu_dereference_raw(init_task.cred); +} extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); @@ -180,6 +187,16 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred) return rcu_replace_pointer(current->cred, revert_cred, 1); } +DEFINE_CLASS(override_creds, + const struct cred *, + revert_creds(_T), + override_creds(override_cred), const struct cred *override_cred) + +#define scoped_with_creds(cred) \ + scoped_class(override_creds, __UNIQUE_ID(label), cred) + +#define scoped_with_kernel_creds() scoped_with_creds(kernel_cred()) + /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference @@ -263,6 +280,11 @@ static inline void put_cred(const struct cred *cred) put_cred_many(cred, 1); } +DEFINE_CLASS(prepare_creds, + struct cred *, + if (_T) put_cred(_T), + prepare_creds(), void) + DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T)) /** diff --git a/include/linux/delay.h b/include/linux/delay.h index 89866bab100d..46412c00033a 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -68,7 +68,7 @@ void usleep_range_state(unsigned long min, unsigned long max, * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * - * For basic information please refere to usleep_range_state(). + * For basic information please refer to usleep_range_state(). * * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. */ @@ -82,10 +82,10 @@ static inline void usleep_range(unsigned long min, unsigned long max) * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * - * For basic information please refere to usleep_range_state(). + * For basic information please refer to usleep_range_state(). * * The sleeping task has the state TASK_IDLE during the sleep to prevent - * contribution to the load avarage. + * contribution to the load average. */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { @@ -96,7 +96,7 @@ static inline void usleep_range_idle(unsigned long min, unsigned long max) * ssleep - wrapper for seconds around msleep * @seconds: Requested sleep duration in seconds * - * Please refere to msleep() for detailed information. + * Please refer to msleep() for detailed information. */ static inline void ssleep(unsigned int seconds) { diff --git a/include/linux/devfreq-governor.h b/include/linux/devfreq-governor.h new file mode 100644 index 000000000000..dfdd0160a29f --- /dev/null +++ b/include/linux/devfreq-governor.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * governor.h - internal header for devfreq governors. + * + * Copyright (C) 2011 Samsung Electronics + * MyungJoo Ham <myungjoo.ham@samsung.com> + * + * This header is for devfreq governors + */ + +#ifndef __LINUX_DEVFREQ_DEVFREQ_H__ +#define __LINUX_DEVFREQ_DEVFREQ_H__ + +#include <linux/devfreq.h> + +#define DEVFREQ_NAME_LEN 16 + +#define to_devfreq(DEV) container_of((DEV), struct devfreq, dev) + +/* Devfreq events */ +#define DEVFREQ_GOV_START 0x1 +#define DEVFREQ_GOV_STOP 0x2 +#define DEVFREQ_GOV_UPDATE_INTERVAL 0x3 +#define DEVFREQ_GOV_SUSPEND 0x4 +#define DEVFREQ_GOV_RESUME 0x5 + +#define DEVFREQ_MIN_FREQ 0 +#define DEVFREQ_MAX_FREQ ULONG_MAX + +/* + * Definition of the governor feature flags + * - DEVFREQ_GOV_FLAG_IMMUTABLE + * : This governor is never changeable to other governors. + * - DEVFREQ_GOV_FLAG_IRQ_DRIVEN + * : The devfreq won't schedule the work for this governor. + */ +#define DEVFREQ_GOV_FLAG_IMMUTABLE BIT(0) +#define DEVFREQ_GOV_FLAG_IRQ_DRIVEN BIT(1) + +/* + * Definition of governor attribute flags except for common sysfs attributes + * - DEVFREQ_GOV_ATTR_POLLING_INTERVAL + * : Indicate polling_interval sysfs attribute + * - DEVFREQ_GOV_ATTR_TIMER + * : Indicate timer sysfs attribute + */ +#define DEVFREQ_GOV_ATTR_POLLING_INTERVAL BIT(0) +#define DEVFREQ_GOV_ATTR_TIMER BIT(1) + +/** + * struct devfreq_governor - Devfreq policy governor + * @node: list node - contains registered devfreq governors + * @name: Governor's name + * @attrs: Governor's sysfs attribute flags + * @flags: Governor's feature flags + * @get_target_freq: Returns desired operating frequency for the device. + * Basically, get_target_freq will run + * devfreq_dev_profile.get_dev_status() to get the + * status of the device (load = busy_time / total_time). + * @event_handler: Callback for devfreq core framework to notify events + * to governors. Events include per device governor + * init and exit, opp changes out of devfreq, suspend + * and resume of per device devfreq during device idle. + * + * Note that the callbacks are called with devfreq->lock locked by devfreq. + */ +struct devfreq_governor { + struct list_head node; + + const char name[DEVFREQ_NAME_LEN]; + const u64 attrs; + const u64 flags; + int (*get_target_freq)(struct devfreq *this, unsigned long *freq); + int (*event_handler)(struct devfreq *devfreq, + unsigned int event, void *data); +}; + +void devfreq_monitor_start(struct devfreq *devfreq); +void devfreq_monitor_stop(struct devfreq *devfreq); +void devfreq_monitor_suspend(struct devfreq *devfreq); +void devfreq_monitor_resume(struct devfreq *devfreq); +void devfreq_update_interval(struct devfreq *devfreq, unsigned int *delay); + +int devfreq_add_governor(struct devfreq_governor *governor); +int devfreq_remove_governor(struct devfreq_governor *governor); + +int devm_devfreq_add_governor(struct device *dev, + struct devfreq_governor *governor); + +int devfreq_update_status(struct devfreq *devfreq, unsigned long freq); +int devfreq_update_target(struct devfreq *devfreq, unsigned long freq); +void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq, + unsigned long *max_freq); + +static inline int devfreq_update_stats(struct devfreq *df) +{ + if (!df->profile->get_dev_status) + return -EINVAL; + + return df->profile->get_dev_status(df->dev.parent, &df->last_status); +} +#endif /* __LINUX_DEVFREQ_DEVFREQ_H__ */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 84fdc3a6a19a..38f625af6ab4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -538,12 +538,18 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone); #ifdef CONFIG_BLK_DEV_ZONED struct dm_report_zones_args { struct dm_target *tgt; + struct gendisk *disk; sector_t next_sector; - void *orig_data; - report_zones_cb orig_cb; unsigned int zone_idx; + /* for block layer ->report_zones */ + struct blk_report_zones_args *rep_args; + + /* for internal users */ + report_zones_cb cb; + void *data; + /* must be filled by ->report_zones before calling dm_report_zones_cb */ sector_t start; }; diff --git a/include/linux/efi.h b/include/linux/efi.h index a98cc39e7aaa..b23ff8b83219 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1126,6 +1126,8 @@ static inline bool efi_runtime_disabled(void) { return true; } extern void efi_call_virt_check_flags(unsigned long flags, const void *caller); extern unsigned long efi_call_virt_save_flags(void); +void efi_runtime_assert_lock_held(void); + enum efi_secureboot_mode { efi_secureboot_mode_unset, efi_secureboot_mode_unknown, diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 69b136e4dd2b..bb3dcded055f 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -60,23 +60,21 @@ #else /* !__ASSEMBLER__ */ #include <uapi/linux/elf.h> +#include <linux/compiler.h> /* * Use an anonymous structure which matches the shape of * Elf{32,64}_Nhdr, but includes the name and desc data. The size and * type of name and desc depend on the macro arguments. "name" must - * be a literal string, and "desc" must be passed by value. You may - * only define one note per line, since __LINE__ is used to generate - * unique symbols. + * be a literal string, and "desc" must be passed by value. */ -#define _ELFNOTE_PASTE(a,b) a##b -#define _ELFNOTE(size, name, unique, type, desc) \ +#define ELFNOTE(size, name, type, desc) \ static const struct { \ struct elf##size##_note _nhdr; \ unsigned char _name[sizeof(name)] \ __attribute__((aligned(sizeof(Elf##size##_Word)))); \ typeof(desc) _desc \ __attribute__((aligned(sizeof(Elf##size##_Word)))); \ - } _ELFNOTE_PASTE(_note_, unique) \ + } __UNIQUE_ID(note) \ __used \ __attribute__((section(".note." name), \ aligned(sizeof(Elf##size##_Word)), \ @@ -89,11 +87,10 @@ name, \ desc \ } -#define ELFNOTE(size, name, type, desc) \ - _ELFNOTE(size, name, __LINE__, type, desc) #define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc) #define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc) + #endif /* __ASSEMBLER__ */ #endif /* _LINUX_ELFNOTE_H */ diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 61d50571ad88..43aa6153dc57 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -54,6 +54,8 @@ struct em_perf_table { /** * struct em_perf_domain - Performance domain * @em_table: Pointer to the runtime modifiable em_perf_table + * @node: node in em_pd_list (in energy_model.c) + * @id: A unique ID number for each performance domain * @nr_perf_states: Number of performance states * @min_perf_state: Minimum allowed Performance State index * @max_perf_state: Maximum allowed Performance State index @@ -71,6 +73,8 @@ struct em_perf_table { */ struct em_perf_domain { struct em_perf_table __rcu *em_table; + struct list_head node; + int id; int nr_perf_states; int min_perf_state; int max_perf_state; diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 7177436f0f9e..87efb38b7081 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -3,11 +3,11 @@ #define __LINUX_ENTRYCOMMON_H #include <linux/irq-entry-common.h> +#include <linux/livepatch.h> #include <linux/ptrace.h> +#include <linux/resume_user_mode.h> #include <linux/seccomp.h> #include <linux/sched.h> -#include <linux/livepatch.h> -#include <linux/resume_user_mode.h> #include <asm/entry-common.h> #include <asm/syscall.h> @@ -37,6 +37,7 @@ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_ENTER) + #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ @@ -44,25 +45,7 @@ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) -/** - * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts - * @regs: Pointer to currents pt_regs - * - * Invoked from architecture specific syscall entry code with interrupts - * disabled. The calling code has to be non-instrumentable. When the - * function returns all state is correct, interrupts are enabled and the - * subsequent functions can be instrumented. - * - * This handles lockdep, RCU (context tracking) and tracing state, i.e. - * the functionality provided by enter_from_user_mode(). - * - * This is invoked when there is extra architecture specific functionality - * to be done between establishing state and handling user mode entry work. - */ -void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); - -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work); +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); /** * syscall_enter_from_user_mode_work - Check and handle work before invoking @@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, * @syscall: The syscall number * * Invoked from architecture specific syscall entry code with interrupts - * enabled after invoking syscall_enter_from_user_mode_prepare() and extra - * architecture specific work. + * enabled after invoking enter_from_user_mode(), enabling interrupts and + * extra architecture specific work. * * Returns: The original or a modified syscall number * @@ -108,8 +91,9 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re * function returns all state is correct, interrupts are enabled and the * subsequent functions can be instrumented. * - * This is combination of syscall_enter_from_user_mode_prepare() and - * syscall_enter_from_user_mode_work(). + * This is the combination of enter_from_user_mode() and + * syscall_enter_from_user_mode_work() to be used when there is no + * architecture specific work to be done between the two. * * Returns: The original or a modified syscall number. See * syscall_enter_from_user_mode_work() for further explanation. @@ -162,7 +146,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) local_irq_enable(); } - rseq_syscall(regs); + rseq_debug_syscall_return(regs); /* * Do one-time syscall specific work. If these work items are @@ -172,7 +156,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) if (unlikely(work & SYSCALL_WORK_EXIT)) syscall_exit_work(regs, work); local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); + syscall_exit_to_user_mode_prepare(regs); } /** diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 8c829d28dcf3..58fd14c82270 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -8,6 +8,10 @@ struct dentry; struct kmem_cache; +enum fault_flags { + FAULT_NOWARN = 1 << 0, +}; + #ifdef CONFIG_FAULT_INJECTION #include <linux/atomic.h> @@ -36,10 +40,6 @@ struct fault_attr { struct dentry *dname; }; -enum fault_flags { - FAULT_NOWARN = 1 << 0, -}; - #define FAULT_ATTR_INITIALIZER { \ .interval = 1, \ .times = ATOMIC_INIT(1), \ diff --git a/include/linux/file.h b/include/linux/file.h index af1768d934a0..cf389fde9bc2 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -127,4 +127,130 @@ extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; +/* + * fd_prepare: Combined fd + file allocation cleanup class. + * @err: Error code to indicate if allocation succeeded. + * @__fd: Allocated fd (may not be accessed directly) + * @__file: Allocated struct file pointer (may not be accessed directly) + * + * Allocates an fd and a file together. On error paths, automatically cleans + * up whichever resource was successfully allocated. Allows flexible file + * allocation with different functions per usage. + * + * Do not use directly. + */ +struct fd_prepare { + s32 err; + s32 __fd; /* do not access directly */ + struct file *__file; /* do not access directly */ +}; + +/* Typedef for fd_prepare cleanup guards. */ +typedef struct fd_prepare class_fd_prepare_t; + +/* + * Accessors for fd_prepare class members. + * _Generic() is used for zero-cost type safety. + */ +#define fd_prepare_fd(_fdf) \ + (_Generic((_fdf), struct fd_prepare: (_fdf).__fd)) + +#define fd_prepare_file(_fdf) \ + (_Generic((_fdf), struct fd_prepare: (_fdf).__file)) + +/* Do not use directly. */ +static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf) +{ + if (unlikely(fdf->err)) { + if (likely(fdf->__fd >= 0)) + put_unused_fd(fdf->__fd); + if (unlikely(!IS_ERR_OR_NULL(fdf->__file))) + fput(fdf->__file); + } +} + +/* Do not use directly. */ +static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf) +{ + if (unlikely(fdf->err)) + return fdf->err; + if (unlikely(fdf->__fd < 0)) + return fdf->__fd; + if (unlikely(IS_ERR(fdf->__file))) + return PTR_ERR(fdf->__file); + if (unlikely(!fdf->__file)) + return -ENOMEM; + return 0; +} + +/* + * __FD_PREPARE_INIT - Helper to initialize fd_prepare class. + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: expression that returns struct file * + * + * Returns a struct fd_prepare with fd, file, and err set. + * If fd allocation fails, fd will be negative and err will be set. If + * fd succeeds but file_init_expr fails, file will be ERR_PTR and err + * will be set. The err field is the single source of truth for error + * checking. + */ +#define __FD_PREPARE_INIT(_fd_flags, _file_owned) \ + ({ \ + struct fd_prepare fdf = { \ + .__fd = get_unused_fd_flags((_fd_flags)), \ + }; \ + if (likely(fdf.__fd >= 0)) \ + fdf.__file = (_file_owned); \ + fdf.err = ACQUIRE_ERR(fd_prepare, &fdf); \ + fdf; \ + }) + +/* + * FD_PREPARE - Macro to declare and initialize an fd_prepare variable. + * + * Declares and initializes an fd_prepare variable with automatic + * cleanup. No separate scope required - cleanup happens when variable + * goes out of scope. + * + * @_fdf: name of struct fd_prepare variable to define + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: struct file to take ownership of (can be expression) + */ +#define FD_PREPARE(_fdf, _fd_flags, _file_owned) \ + CLASS_INIT(fd_prepare, _fdf, __FD_PREPARE_INIT(_fd_flags, _file_owned)) + +/* + * fd_publish - Publish prepared fd and file to the fd table. + * @_fdf: struct fd_prepare variable + */ +#define fd_publish(_fdf) \ + ({ \ + struct fd_prepare *fdp = &(_fdf); \ + VFS_WARN_ON_ONCE(fdp->err); \ + VFS_WARN_ON_ONCE(fdp->__fd < 0); \ + VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \ + fd_install(fdp->__fd, fdp->__file); \ + fdp->__fd; \ + }) + +/* Do not use directly. */ +#define __FD_ADD(_fdf, _fd_flags, _file_owned) \ + ({ \ + FD_PREPARE(_fdf, _fd_flags, _file_owned); \ + s32 ret = _fdf.err; \ + if (likely(!ret)) \ + ret = fd_publish(_fdf); \ + ret; \ + }) + +/* + * FD_ADD - Allocate and install an fd and file in one step. + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: struct file to take ownership of + * + * Returns the allocated fd number, or negative error code on failure. + */ +#define FD_ADD(_fd_flags, _file_owned) \ + __FD_ADD(__UNIQUE_ID(fd_prepare), _fd_flags, _file_owned) + #endif /* __LINUX_FILE_H */ diff --git a/include/linux/filelock.h b/include/linux/filelock.h index c2ce8ba05d06..54b824c05299 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -159,6 +159,8 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int, int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); +int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg); +int fcntl_getdeleg(struct file *filp, struct delegation *deleg); static inline bool lock_is_unlock(struct file_lock *fl) { @@ -212,7 +214,14 @@ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); -int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); + +#define LEASE_BREAK_LEASE BIT(0) // break leases and delegations +#define LEASE_BREAK_DELEG BIT(1) // break delegations only +#define LEASE_BREAK_LAYOUT BIT(2) // break layouts only +#define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break +#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event + +int __break_lease(struct inode *inode, unsigned int flags); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); @@ -271,6 +280,16 @@ static inline int fcntl_getlease(struct file *filp) return F_UNLCK; } +static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) +{ + return -EINVAL; +} + +static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg) +{ + return -EINVAL; +} + static inline bool lock_is_unlock(struct file_lock *fl) { return false; @@ -367,7 +386,7 @@ static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *f return -ENOLCK; } -static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +static inline int __break_lease(struct inode *inode, unsigned int flags) { return 0; } @@ -428,6 +447,17 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) } #ifdef CONFIG_FILE_LOCKING +static inline unsigned int openmode_to_lease_flags(unsigned int mode) +{ + unsigned int flags = 0; + + if ((mode & O_ACCMODE) == O_RDONLY) + flags |= LEASE_BREAK_OPEN_RDONLY; + if (mode & O_NONBLOCK) + flags |= LEASE_BREAK_NONBLOCK; + return flags; +} + static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; @@ -443,11 +473,11 @@ static inline int break_lease(struct inode *inode, unsigned int mode) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) - return __break_lease(inode, mode, FL_LEASE); + return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode)); return 0; } -static inline int break_deleg(struct inode *inode, unsigned int mode) +static inline int break_deleg(struct inode *inode, unsigned int flags) { struct file_lock_context *flctx; @@ -461,60 +491,84 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) if (!flctx) return 0; smp_mb(); - if (!list_empty_careful(&flctx->flc_lease)) - return __break_lease(inode, mode, FL_DELEG); + if (!list_empty_careful(&flctx->flc_lease)) { + flags |= LEASE_BREAK_DELEG; + return __break_lease(inode, flags); + } return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +struct delegated_inode { + struct inode *di_inode; +}; + +static inline bool is_delegated(struct delegated_inode *di) +{ + return di->di_inode; +} + +static inline int try_break_deleg(struct inode *inode, + struct delegated_inode *di) { int ret; - ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); - if (ret == -EWOULDBLOCK && delegated_inode) { - *delegated_inode = inode; + ret = break_deleg(inode, LEASE_BREAK_NONBLOCK); + if (ret == -EWOULDBLOCK && di) { + di->di_inode = inode; ihold(inode); } return ret; } -static inline int break_deleg_wait(struct inode **delegated_inode) +static inline int break_deleg_wait(struct delegated_inode *di) { int ret; - ret = break_deleg(*delegated_inode, O_WRONLY); - iput(*delegated_inode); - *delegated_inode = NULL; + ret = break_deleg(di->di_inode, 0); + iput(di->di_inode); + di->di_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, - wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, - FL_LAYOUT); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + unsigned int flags = LEASE_BREAK_LAYOUT; + + if (!wait) + flags |= LEASE_BREAK_NONBLOCK; + + return __break_lease(inode, flags); + } return 0; } #else /* !CONFIG_FILE_LOCKING */ -static inline int break_lease(struct inode *inode, unsigned int mode) +struct delegated_inode { }; + +static inline bool is_delegated(struct delegated_inode *di) +{ + return false; +} + +static inline int break_lease(struct inode *inode, bool wait) { return 0; } -static inline int break_deleg(struct inode *inode, unsigned int mode) +static inline int break_deleg(struct inode *inode, unsigned int flags) { return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +static inline int try_break_deleg(struct inode *inode, + struct delegated_inode *delegated_inode) { return 0; } -static inline int break_deleg_wait(struct inode **delegated_inode) +static inline int break_deleg_wait(struct delegated_inode *delegated_inode) { BUG(); return 0; diff --git a/include/linux/filter.h b/include/linux/filter.h index 03e7516c6187..fd54fed8f95f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,11 +712,13 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, ret = dfunc(ctx, prog->insnsi, prog->bpf_func); duration = sched_clock() - start; - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, duration); - u64_stats_update_end_irqrestore(&stats->syncp, flags); + if (likely(prog->stats)) { + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, duration); + u64_stats_update_end_irqrestore(&stats->syncp, flags); + } } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 32884c9721e5..0a8c6c4d1a82 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -22,14 +22,18 @@ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ extern unsigned int freeze_timeout_msecs; /* - * Check if a process has been frozen + * Check if a process has been frozen for PM or cgroup1 freezer. Note that + * cgroup2 freezer uses the job control mechanism and does not interact with + * the PM freezer. */ extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); /* - * Check if there is a request to freeze a process + * Check if there is a request to freeze a task from PM or cgroup1 freezer. + * Note that cgroup2 freezer uses the job control mechanism and does not + * interact with the PM freezer. */ static inline bool freezing(struct task_struct *p) { @@ -63,9 +67,9 @@ extern bool freeze_task(struct task_struct *p); extern bool set_freezable(void); #ifdef CONFIG_CGROUP_FREEZER -extern bool cgroup_freezing(struct task_struct *task); +extern bool cgroup1_freezing(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline bool cgroup_freezing(struct task_struct *task) +static inline bool cgroup1_freezing(struct task_struct *task) { return false; } diff --git a/include/linux/fs.h b/include/linux/fs.h index dd3b57cfadee..ce25feb06727 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H +#include <linux/fs/super.h> #include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> @@ -11,7 +12,6 @@ #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> -#include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> @@ -37,7 +37,6 @@ #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> -#include <linux/fs_types.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <linux/mount.h> @@ -52,11 +51,9 @@ #include <asm/byteorder.h> #include <uapi/linux/fs.h> -struct backing_dev_info; struct bdi_writeback; struct bio; struct io_comp_batch; -struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; @@ -70,16 +67,13 @@ struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; -struct workqueue_struct; struct iov_iter; -struct fscrypt_operations; -struct fsverity_operations; struct fsnotify_mark_connector; -struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct file_kattr; struct iomap_ops; +struct delegated_inode; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -299,11 +293,6 @@ struct iattr { }; /* - * Includes for diskquotas. - */ -#include <linux/quota.h> - -/* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ @@ -367,23 +356,9 @@ struct readahead_control; #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) -/* - * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the - * iocb completion can be passed back to the owner for execution from a safe - * context rather than needing to be punted through a workqueue. If this - * flag is set, the bio completion handling may set iocb->dio_complete to a - * handler function and iocb->private to context information for that handler. - * The issuer should call the handler with that context information from task - * context to complete the processing of the iocb. Note that while this - * provides a task context for the dio_complete() callback, it should only be - * used on the completion side for non-IO generating completions. It's fine to - * call blocking functions from this callback, but they should not wait for - * unrelated IO (like cache flushing, new IO generation, etc). - */ -#define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ -#define IOCB_AIO_RW (1 << 23) -#define IOCB_HAS_METADATA (1 << 24) +#define IOCB_AIO_RW (1 << 22) +#define IOCB_HAS_METADATA (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ @@ -400,7 +375,6 @@ struct readahead_control; { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ - { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ { IOCB_AIO_RW, "AIO_RW" }, \ { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } @@ -412,23 +386,13 @@ struct kiocb { int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ u8 ki_write_stream; - union { - /* - * Only used for async buffered reads, where it denotes the - * page waitqueue associated with completing the read. Valid - * IFF IOCB_WAITQ is set. - */ - struct wait_page_queue *ki_waitq; - /* - * Can be used for O_DIRECT IO, where the completion handling - * is punted back to the issuer of the IO. May only be set - * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer - * must then check for presence of this handler when ki_complete - * is invoked. The data passed in to this handler must be - * assigned to ->private when dio_complete is assigned. - */ - ssize_t (*dio_complete)(void *data); - }; + + /* + * Only used for async buffered reads, where it denotes the page + * waitqueue associated with completing the read. + * Valid IFF IOCB_WAITQ is set. + */ + struct wait_page_queue *ki_waitq; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -659,13 +623,14 @@ is_uncached_acl(struct posix_acl *acl) return (long)acl & 1; } -#define IOP_FASTPERM 0x0001 -#define IOP_LOOKUP 0x0002 -#define IOP_NOFOLLOW 0x0004 -#define IOP_XATTR 0x0008 +#define IOP_FASTPERM 0x0001 +#define IOP_LOOKUP 0x0002 +#define IOP_NOFOLLOW 0x0004 +#define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 -#define IOP_MGTIME 0x0020 -#define IOP_CACHED_LINK 0x0040 +#define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 +#define IOP_FASTPERM_MAY_EXEC 0x0080 /* * Inode state bits. Protected by inode->i_lock @@ -759,7 +724,7 @@ enum inode_state_bits { /* reserved wait address bit 3 */ }; -enum inode_state_flags_t { +enum inode_state_flags_enum { I_NEW = (1U << __I_NEW), I_SYNC = (1U << __I_SYNC), I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), @@ -786,6 +751,13 @@ enum inode_state_flags_t { #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) /* + * Use inode_state_read() & friends to access. + */ +struct inode_state_flags { + enum inode_state_flags_enum __state; +}; + +/* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' @@ -793,14 +765,13 @@ enum inode_state_flags_t { struct inode { umode_t i_mode; unsigned short i_opflags; - kuid_t i_uid; - kgid_t i_gid; unsigned int i_flags; - #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif + kuid_t i_uid; + kgid_t i_gid; const struct inode_operations *i_op; struct super_block *i_sb; @@ -843,7 +814,7 @@ struct inode { #endif /* Misc */ - enum inode_state_flags_t i_state; + struct inode_state_flags i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -902,6 +873,80 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +/* + * i_state handling + * + * We hide all of it behind helpers so that we can validate consumers. + */ +static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode) +{ + return READ_ONCE(inode->i_state.__state); +} + +static inline enum inode_state_flags_enum inode_state_read(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + return inode->i_state.__state; +} + +static inline void inode_state_set_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags); +} + +static inline void inode_state_set(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_set_raw(inode, flags); +} + +static inline void inode_state_clear_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags); +} + +static inline void inode_state_clear(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_clear_raw(inode, flags); +} + +static inline void inode_state_assign_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state.__state, flags); +} + +static inline void inode_state_assign(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_assign_raw(inode, flags); +} + +static inline void inode_state_replace_raw(struct inode *inode, + enum inode_state_flags_enum clearflags, + enum inode_state_flags_enum setflags) +{ + enum inode_state_flags_enum flags; + flags = inode->i_state.__state; + flags &= ~clearflags; + flags |= setflags; + inode_state_assign_raw(inode, flags); +} + +static inline void inode_state_replace(struct inode *inode, + enum inode_state_flags_enum clearflags, + enum inode_state_flags_enum setflags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_replace_raw(inode, clearflags, setflags); +} + static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { VFS_WARN_ON_INODE(strlen(link) != linklen, inode); @@ -949,6 +994,8 @@ static inline void inode_fake_hash(struct inode *inode) hlist_add_fake(&inode->i_hash); } +void wait_on_new_inode(struct inode *inode); + /* * inode->i_rwsem nesting subclasses for the lock validator: * @@ -1348,49 +1395,6 @@ extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); /* - * sb->s_flags. Note that these mirror the equivalent MS_* flags where - * represented in both. - */ -#define SB_RDONLY BIT(0) /* Mount read-only */ -#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ -#define SB_NODEV BIT(2) /* Disallow access to device special files */ -#define SB_NOEXEC BIT(3) /* Disallow program execution */ -#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ -#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ -#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ -#define SB_NOATIME BIT(10) /* Do not update access times. */ -#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ -#define SB_SILENT BIT(15) -#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ -#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ -#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ -#define SB_I_VERSION BIT(23) /* Update inode I_version field */ -#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ - -/* These sb flags are internal to the kernel */ -#define SB_DEAD BIT(21) -#define SB_DYING BIT(24) -#define SB_FORCE BIT(27) -#define SB_NOSEC BIT(28) -#define SB_BORN BIT(29) -#define SB_ACTIVE BIT(30) -#define SB_NOUSER BIT(31) - -/* These flags relate to encoding and casefolding */ -#define SB_ENC_STRICT_MODE_FL (1 << 0) -#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) - -#define sb_has_strict_encoding(sb) \ - (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) - -#if IS_ENABLED(CONFIG_UNICODE) -#define sb_no_casefold_compat_fallback(sb) \ - (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) -#else -#define sb_no_casefold_compat_fallback(sb) (1) -#endif - -/* * Umount options */ @@ -1400,191 +1404,6 @@ extern int send_sigurg(struct file *file); #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ -/* sb->s_iflags */ -#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ -#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ -#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ -#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ - -/* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 -#define SB_I_UNTRUSTED_MOUNTER 0x00000040 -#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 - -#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ -#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ -#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ -#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ -#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ -#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ -#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ - -/* Possible states of 'frozen' field */ -enum { - SB_UNFROZEN = 0, /* FS is unfrozen */ - SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ - SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ - SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop - * internal threads if needed) */ - SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ -}; - -#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) - -struct sb_writers { - unsigned short frozen; /* Is sb frozen? */ - int freeze_kcount; /* How many kernel freeze requests? */ - int freeze_ucount; /* How many userspace freeze requests? */ - const void *freeze_owner; /* Owner of the freeze */ - struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; -}; - -struct mount; - -struct super_block { - struct list_head s_list; /* Keep this first */ - dev_t s_dev; /* search index; _not_ kdev_t */ - unsigned char s_blocksize_bits; - unsigned long s_blocksize; - loff_t s_maxbytes; /* Max file size */ - struct file_system_type *s_type; - const struct super_operations *s_op; - const struct dquot_operations *dq_op; - const struct quotactl_ops *s_qcop; - const struct export_operations *s_export_op; - unsigned long s_flags; - unsigned long s_iflags; /* internal SB_I_* flags */ - unsigned long s_magic; - struct dentry *s_root; - struct rw_semaphore s_umount; - int s_count; - atomic_t s_active; -#ifdef CONFIG_SECURITY - void *s_security; -#endif - const struct xattr_handler * const *s_xattr; -#ifdef CONFIG_FS_ENCRYPTION - const struct fscrypt_operations *s_cop; - struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ -#endif -#ifdef CONFIG_FS_VERITY - const struct fsverity_operations *s_vop; -#endif -#if IS_ENABLED(CONFIG_UNICODE) - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif - struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ - struct mount *s_mounts; /* list of mounts; _not_ for fs use */ - struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ - struct file *s_bdev_file; - struct backing_dev_info *s_bdi; - struct mtd_info *s_mtd; - struct hlist_node s_instances; - unsigned int s_quota_types; /* Bitmask of supported quota types */ - struct quota_info s_dquot; /* Diskquota specific options */ - - struct sb_writers s_writers; - - /* - * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and - * s_fsnotify_info together for cache efficiency. They are frequently - * accessed and rarely modified. - */ - void *s_fs_info; /* Filesystem private info */ - - /* Granularity of c/m/atime in ns (cannot be worse than a second) */ - u32 s_time_gran; - /* Time limits for c/m/atime in seconds */ - time64_t s_time_min; - time64_t s_time_max; -#ifdef CONFIG_FSNOTIFY - u32 s_fsnotify_mask; - struct fsnotify_sb_info *s_fsnotify_info; -#endif - - /* - * q: why are s_id and s_sysfs_name not the same? both are human - * readable strings that identify the filesystem - * a: s_id is allowed to change at runtime; it's used in log messages, - * and we want to when a device starts out as single device (s_id is dev - * name) but then a device is hot added and we have to switch to - * identifying it by UUID - * but s_sysfs_name is a handle for programmatic access, and can't - * change at runtime - */ - char s_id[32]; /* Informational name */ - uuid_t s_uuid; /* UUID */ - u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ - - /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ - char s_sysfs_name[UUID_STRING_LEN + 1]; - - unsigned int s_max_links; - unsigned int s_d_flags; /* default d_flags for dentries */ - - /* - * The next field is for VFS *only*. No filesystems have any business - * even looking at it. You had been warned. - */ - struct mutex s_vfs_rename_mutex; /* Kludge */ - - /* - * Filesystem subtype. If non-empty the filesystem type field - * in /proc/mounts will be "type.subtype" - */ - const char *s_subtype; - - const struct dentry_operations *__s_d_op; /* default d_op for dentries */ - - struct shrinker *s_shrink; /* per-sb shrinker handle */ - - /* Number of inodes with nlink == 0 but still referenced */ - atomic_long_t s_remove_count; - - /* Read-only state of the superblock is being changed */ - int s_readonly_remount; - - /* per-sb errseq_t for reporting writeback errors via syncfs */ - errseq_t s_wb_err; - - /* AIO completions deferred from interrupt context */ - struct workqueue_struct *s_dio_done_wq; - struct hlist_head s_pins; - - /* - * Owning user namespace and default context in which to - * interpret filesystem uids, gids, quotas, device nodes, - * xattrs and security labels. - */ - struct user_namespace *s_user_ns; - - /* - * The list_lru structure is essentially just a pointer to a table - * of per-node lru lists, each of which has its own spinlock. - * There is no need to put them into separate cachelines. - */ - struct list_lru s_dentry_lru; - struct list_lru s_inode_lru; - struct rcu_head rcu; - struct work_struct destroy_work; - - struct mutex s_sync_lock; /* sync serialisation lock */ - - /* - * Indicates how deep in a filesystem stack this SB is - */ - int s_stack_depth; - - /* s_inode_list_lock protects s_inodes */ - spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; - struct list_head s_inodes; /* all inodes */ - - spinlock_t s_inode_wblist_lock; - struct list_head s_inodes_wb; /* writeback inodes */ -} __randomize_layout; - static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; @@ -1902,66 +1721,6 @@ struct timespec64 simple_inode_init_ts(struct inode *inode); * Snapshotting support. */ -/* - * These are internal functions, please use sb_start_{write,pagefault,intwrite} - * instead. - */ -static inline void __sb_end_write(struct super_block *sb, int level) -{ - percpu_up_read(sb->s_writers.rw_sem + level-1); -} - -static inline void __sb_start_write(struct super_block *sb, int level) -{ - percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); -} - -static inline bool __sb_start_write_trylock(struct super_block *sb, int level) -{ - return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); -} - -#define __sb_writers_acquired(sb, lev) \ - percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) -#define __sb_writers_release(sb, lev) \ - percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) - -/** - * __sb_write_started - check if sb freeze level is held - * @sb: the super we write to - * @level: the freeze level - * - * * > 0 - sb freeze level is held - * * 0 - sb freeze level is not held - * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN - */ -static inline int __sb_write_started(const struct super_block *sb, int level) -{ - return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); -} - -/** - * sb_write_started - check if SB_FREEZE_WRITE is held - * @sb: the super we write to - * - * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. - */ -static inline bool sb_write_started(const struct super_block *sb) -{ - return __sb_write_started(sb, SB_FREEZE_WRITE); -} - -/** - * sb_write_not_started - check if SB_FREEZE_WRITE is not held - * @sb: the super we write to - * - * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. - */ -static inline bool sb_write_not_started(const struct super_block *sb) -{ - return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; -} - /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to @@ -1992,137 +1751,26 @@ static inline bool file_write_not_started(const struct file *file) return sb_write_not_started(file_inode(file)->i_sb); } -/** - * sb_end_write - drop write access to a superblock - * @sb: the super we wrote to - * - * Decrement number of writers to the filesystem. Wake up possible waiters - * wanting to freeze the filesystem. - */ -static inline void sb_end_write(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_WRITE); -} - -/** - * sb_end_pagefault - drop write access to a superblock from a page fault - * @sb: the super we wrote to - * - * Decrement number of processes handling write page fault to the filesystem. - * Wake up possible waiters wanting to freeze the filesystem. - */ -static inline void sb_end_pagefault(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_PAGEFAULT); -} - -/** - * sb_end_intwrite - drop write access to a superblock for internal fs purposes - * @sb: the super we wrote to - * - * Decrement fs-internal number of writers to the filesystem. Wake up possible - * waiters wanting to freeze the filesystem. - */ -static inline void sb_end_intwrite(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_FS); -} - -/** - * sb_start_write - get write access to a superblock - * @sb: the super we write to - * - * When a process wants to write data or metadata to a file system (i.e. dirty - * a page or an inode), it should embed the operation in a sb_start_write() - - * sb_end_write() pair to get exclusion against file system freezing. This - * function increments number of writers preventing freezing. If the file - * system is already frozen, the function waits until the file system is - * thawed. - * - * Since freeze protection behaves as a lock, users have to preserve - * ordering of freeze protection and other filesystem locks. Generally, - * freeze protection should be the outermost lock. In particular, we have: - * - * sb_start_write - * -> i_rwsem (write path, truncate, directory ops, ...) - * -> s_umount (freeze_super, thaw_super) - */ -static inline void sb_start_write(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_WRITE); -} - -static inline bool sb_start_write_trylock(struct super_block *sb) -{ - return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); -} - -/** - * sb_start_pagefault - get write access to a superblock from a page fault - * @sb: the super we write to - * - * When a process starts handling write page fault, it should embed the - * operation into sb_start_pagefault() - sb_end_pagefault() pair to get - * exclusion against file system freezing. This is needed since the page fault - * is going to dirty a page. This function increments number of running page - * faults preventing freezing. If the file system is already frozen, the - * function waits until the file system is thawed. - * - * Since page fault freeze protection behaves as a lock, users have to preserve - * ordering of freeze protection and other filesystem locks. It is advised to - * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault - * handling code implies lock dependency: - * - * mmap_lock - * -> sb_start_pagefault - */ -static inline void sb_start_pagefault(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_PAGEFAULT); -} - -/** - * sb_start_intwrite - get write access to a superblock for internal fs purposes - * @sb: the super we write to - * - * This is the third level of protection against filesystem freezing. It is - * free for use by a filesystem. The only requirement is that it must rank - * below sb_start_pagefault. - * - * For example filesystem can call sb_start_intwrite() when starting a - * transaction which somewhat eases handling of freezing for internal sources - * of filesystem changes (internal fs threads, discarding preallocation on file - * close, etc.). - */ -static inline void sb_start_intwrite(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_FS); -} - -static inline bool sb_start_intwrite_trylock(struct super_block *sb) -{ - return __sb_start_write_trylock(sb, SB_FREEZE_FS); -} - bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. */ -int vfs_create(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t, bool); +int vfs_create(struct mnt_idmap *, struct dentry *, umode_t, + struct delegated_inode *); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); + struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t, dev_t); + umode_t, dev_t, struct delegated_inode *); int vfs_symlink(struct mnt_idmap *, struct inode *, - struct dentry *, const char *); + struct dentry *, const char *, struct delegated_inode *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, - struct dentry *, struct inode **); -int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); + struct dentry *, struct delegated_inode *); +int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *, + struct delegated_inode *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, - struct inode **); + struct delegated_inode *); /** * struct renamedata - contains all information required for renaming @@ -2140,7 +1788,7 @@ struct renamedata { struct dentry *old_dentry; struct dentry *new_parent; struct dentry *new_dentry; - struct inode **delegated_inode; + struct delegated_inode *delegated_inode; unsigned int flags; } __randomize_layout; @@ -2150,7 +1798,7 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, - WHITEOUT_DEV); + WHITEOUT_DEV, NULL); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, @@ -2431,72 +2079,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); -/** - * enum freeze_holder - holder of the freeze - * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem - * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem - * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed - * @FREEZE_EXCL: a freeze that can only be undone by the owner - * - * Indicate who the owner of the freeze or thaw request is and whether - * the freeze needs to be exclusive or can nest. - * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the - * same holder aren't allowed. It is however allowed to hold a single - * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at - * the same time. This is relied upon by some filesystems during online - * repair or similar. - */ -enum freeze_holder { - FREEZE_HOLDER_KERNEL = (1U << 0), - FREEZE_HOLDER_USERSPACE = (1U << 1), - FREEZE_MAY_NEST = (1U << 2), - FREEZE_EXCL = (1U << 3), -}; - -struct super_operations { - struct inode *(*alloc_inode)(struct super_block *sb); - void (*destroy_inode)(struct inode *); - void (*free_inode)(struct inode *); - - void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, struct writeback_control *wbc); - int (*drop_inode) (struct inode *); - void (*evict_inode) (struct inode *); - void (*put_super) (struct super_block *); - int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner); - int (*freeze_fs) (struct super_block *); - int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner); - int (*unfreeze_fs) (struct super_block *); - int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); - void (*umount_begin) (struct super_block *); - - int (*show_options)(struct seq_file *, struct dentry *); - int (*show_devname)(struct seq_file *, struct dentry *); - int (*show_path)(struct seq_file *, struct dentry *); - int (*show_stats)(struct seq_file *, struct dentry *); -#ifdef CONFIG_QUOTA - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - struct dquot __rcu **(*get_dquots)(struct inode *); -#endif - long (*nr_cached_objects)(struct super_block *, - struct shrink_control *); - long (*free_cached_objects)(struct super_block *, - struct shrink_control *); - /* - * If a filesystem can support graceful removal of a device and - * continue read-write operations, implement this callback. - * - * Return 0 if the filesystem can continue read-write. - * Non-zero return value or no such callback means the fs will be shutdown - * as usual. - */ - int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); - void (*shutdown)(struct super_block *sb); -}; - /* * Inode flags - they have no relation to superblock flags now */ @@ -2539,7 +2121,6 @@ struct super_operations { */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) -static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) @@ -2635,8 +2216,8 @@ static inline int icount_read(const struct inode *inode) */ static inline bool inode_is_dirtytime_only(struct inode *inode) { - return (inode->i_state & (I_DIRTY_TIME | I_NEW | - I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; + return (inode_state_read_once(inode) & + (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); @@ -2774,10 +2355,6 @@ extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -int freeze_super(struct super_block *super, enum freeze_holder who, - const void *freeze_owner); -int thaw_super(struct super_block *super, enum freeze_holder who, - const void *freeze_owner); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); @@ -2820,8 +2397,6 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch va_end(args); } -extern int current_umask(void); - extern void ihold(struct inode * inode); extern void iput(struct inode *); void iput_not_last(struct inode *); @@ -2965,12 +2540,6 @@ extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) -extern struct super_block *blockdev_superblock; -static inline bool sb_is_blkdev_sb(struct super_block *sb) -{ - return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; -} - void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; @@ -3016,7 +2585,7 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) @@ -3053,8 +2622,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, - iocb->ki_pos - 1); + filemap_flush_range(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; @@ -3073,7 +2642,7 @@ static inline int bmap(struct inode *inode, sector_t *block) #endif int notify_change(struct mnt_idmap *, struct dentry *, - struct iattr *, struct inode **); + struct iattr *, struct delegated_inode *); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) @@ -3103,7 +2672,7 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * - * This is a variant of sb_start_write() which is a noop on non-regualr file. + * This is a variant of sb_start_write() which is a noop on non-regular file. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) @@ -3271,6 +2840,7 @@ extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); +u64 vfsmount_to_propagation_flags(struct vfsmount *mnt); extern char *file_path(struct file *, char *, int); @@ -3328,7 +2898,7 @@ extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), - void *data); + void *data, bool *isnew); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup(struct super_block *sb, unsigned long ino); @@ -3380,11 +2950,9 @@ static inline bool is_zero_ino(ino_t ino) return (u32)ino == 0; } -/* - * inode->i_lock must be held - */ static inline void __iget(struct inode *inode) { + lockdep_assert_held(&inode->i_lock); atomic_inc(&inode->i_count); } @@ -3423,10 +2991,7 @@ static inline void remove_inode_hash(struct inode *inode) } extern void inode_sb_list_add(struct inode *inode); -extern void inode_add_lru(struct inode *inode); - -int sb_set_blocksize(struct super_block *sb, int size); -int __must_check sb_min_blocksize(struct super_block *sb, int size); +extern void inode_lru_list_add(struct inode *inode); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); @@ -3611,6 +3176,8 @@ extern void iterate_supers_type(struct file_system_type *, void filesystems_freeze(bool freeze_all); void filesystems_thaw(void); +void end_dirop(struct dentry *de); + extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); @@ -3747,38 +3314,6 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, } #endif -static inline struct unicode_map *sb_encoding(const struct super_block *sb) -{ -#if IS_ENABLED(CONFIG_UNICODE) - return sb->s_encoding; -#else - return NULL; -#endif -} - -static inline bool sb_has_encoding(const struct super_block *sb) -{ - return !!sb_encoding(sb); -} - -/* - * Compare if two super blocks have the same encoding and flags - */ -static inline bool sb_same_encoding(const struct super_block *sb1, - const struct super_block *sb2) -{ -#if IS_ENABLED(CONFIG_UNICODE) - if (sb1->s_encoding == sb2->s_encoding) - return true; - - return (sb1->s_encoding && sb2->s_encoding && - (sb1->s_encoding->version == sb2->s_encoding->version) && - (sb1->s_encoding_flags == sb2->s_encoding_flags)); -#else - return true; -#endif -} - int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h new file mode 100644 index 000000000000..f21ffbb6dea5 --- /dev/null +++ b/include/linux/fs/super.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_SUPER_H +#define _LINUX_FS_SUPER_H + +#include <linux/fs/super_types.h> +#include <linux/unicode.h> + +/* + * These are internal functions, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +static inline void __sb_end_write(struct super_block *sb, int level) +{ + percpu_up_read(sb->s_writers.rw_sem + level - 1); +} + +static inline void __sb_start_write(struct super_block *sb, int level) +{ + percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); +} + +static inline bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} + +#define __sb_writers_acquired(sb, lev) \ + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_) + +/** + * __sb_write_started - check if sb freeze level is held + * @sb: the super we write to + * @level: the freeze level + * + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + */ +static inline int __sb_write_started(const struct super_block *sb, int level) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); +} + +/** + * sb_write_started - check if SB_FREEZE_WRITE is held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE); +} + +/** + * sb_write_not_started - check if SB_FREEZE_WRITE is not held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_not_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; +} + +/** + * sb_end_write - drop write access to a superblock + * @sb: the super we wrote to + * + * Decrement number of writers to the filesystem. Wake up possible waiters + * wanting to freeze the filesystem. + */ +static inline void sb_end_write(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_WRITE); +} + +/** + * sb_end_pagefault - drop write access to a superblock from a page fault + * @sb: the super we wrote to + * + * Decrement number of processes handling write page fault to the filesystem. + * Wake up possible waiters wanting to freeze the filesystem. + */ +static inline void sb_end_pagefault(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_end_intwrite - drop write access to a superblock for internal fs purposes + * @sb: the super we wrote to + * + * Decrement fs-internal number of writers to the filesystem. Wake up possible + * waiters wanting to freeze the filesystem. + */ +static inline void sb_end_intwrite(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_FS); +} + +/** + * sb_start_write - get write access to a superblock + * @sb: the super we write to + * + * When a process wants to write data or metadata to a file system (i.e. dirty + * a page or an inode), it should embed the operation in a sb_start_write() - + * sb_end_write() pair to get exclusion against file system freezing. This + * function increments number of writers preventing freezing. If the file + * system is already frozen, the function waits until the file system is + * thawed. + * + * Since freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. Generally, + * freeze protection should be the outermost lock. In particular, we have: + * + * sb_start_write + * -> i_rwsem (write path, truncate, directory ops, ...) + * -> s_umount (freeze_super, thaw_super) + */ +static inline void sb_start_write(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_WRITE); +} + +DEFINE_GUARD(super_write, + struct super_block *, + sb_start_write(_T), + sb_end_write(_T)) + +static inline bool sb_start_write_trylock(struct super_block *sb) +{ + return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); +} + +/** + * sb_start_pagefault - get write access to a superblock from a page fault + * @sb: the super we write to + * + * When a process starts handling write page fault, it should embed the + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get + * exclusion against file system freezing. This is needed since the page fault + * is going to dirty a page. This function increments number of running page + * faults preventing freezing. If the file system is already frozen, the + * function waits until the file system is thawed. + * + * Since page fault freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. It is advised to + * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault + * handling code implies lock dependency: + * + * mmap_lock + * -> sb_start_pagefault + */ +static inline void sb_start_pagefault(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_start_intwrite - get write access to a superblock for internal fs purposes + * @sb: the super we write to + * + * This is the third level of protection against filesystem freezing. It is + * free for use by a filesystem. The only requirement is that it must rank + * below sb_start_pagefault. + * + * For example filesystem can call sb_start_intwrite() when starting a + * transaction which somewhat eases handling of freezing for internal sources + * of filesystem changes (internal fs threads, discarding preallocation on file + * close, etc.). + */ +static inline void sb_start_intwrite(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_FS); +} + +static inline bool sb_start_intwrite_trylock(struct super_block *sb) +{ + return __sb_start_write_trylock(sb, SB_FREEZE_FS); +} + +static inline bool sb_rdonly(const struct super_block *sb) +{ + return sb->s_flags & SB_RDONLY; +} + +static inline bool sb_is_blkdev_sb(struct super_block *sb) +{ + return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; +} + +#if IS_ENABLED(CONFIG_UNICODE) +static inline struct unicode_map *sb_encoding(const struct super_block *sb) +{ + return sb->s_encoding; +} + +/* Compare if two super blocks have the same encoding and flags */ +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ + if (sb1->s_encoding == sb2->s_encoding) + return true; + + return (sb1->s_encoding && sb2->s_encoding && + (sb1->s_encoding->version == sb2->s_encoding->version) && + (sb1->s_encoding_flags == sb2->s_encoding_flags)); +} +#else +static inline struct unicode_map *sb_encoding(const struct super_block *sb) +{ + return NULL; +} + +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ + return true; +} +#endif + +static inline bool sb_has_encoding(const struct super_block *sb) +{ + return !!sb_encoding(sb); +} + +int sb_set_blocksize(struct super_block *sb, int size); +int __must_check sb_min_blocksize(struct super_block *sb, int size); + +int freeze_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); +int thaw_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); + +#endif /* _LINUX_FS_SUPER_H */ diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h new file mode 100644 index 000000000000..6bd3009e09b3 --- /dev/null +++ b/include/linux/fs/super_types.h @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_SUPER_TYPES_H +#define _LINUX_FS_SUPER_TYPES_H + +#include <linux/fs_dirent.h> +#include <linux/errseq.h> +#include <linux/list_lru.h> +#include <linux/list.h> +#include <linux/list_bl.h> +#include <linux/llist.h> +#include <linux/uidgid.h> +#include <linux/uuid.h> +#include <linux/percpu-rwsem.h> +#include <linux/workqueue_types.h> +#include <linux/quota.h> + +struct backing_dev_info; +struct block_device; +struct dentry; +struct dentry_operations; +struct dquot_operations; +struct export_operations; +struct file; +struct file_system_type; +struct fscrypt_operations; +struct fsnotify_sb_info; +struct fsverity_operations; +struct kstatfs; +struct mount; +struct mtd_info; +struct quotactl_ops; +struct shrinker; +struct unicode_map; +struct user_namespace; +struct workqueue_struct; +struct writeback_control; +struct xattr_handler; + +extern struct super_block *blockdev_superblock; + +/* Possible states of 'frozen' field */ +enum { + SB_UNFROZEN = 0, /* FS is unfrozen */ + SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ + SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ + SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop internal threads if needed) */ + SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ +}; + +#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) + +struct sb_writers { + unsigned short frozen; /* Is sb frozen? */ + int freeze_kcount; /* How many kernel freeze requests? */ + int freeze_ucount; /* How many userspace freeze requests? */ + const void *freeze_owner; /* Owner of the freeze */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; +}; + +/** + * enum freeze_holder - holder of the freeze + * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem + * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem + * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * @FREEZE_EXCL: a freeze that can only be undone by the owner + * + * Indicate who the owner of the freeze or thaw request is and whether + * the freeze needs to be exclusive or can nest. + * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the + * same holder aren't allowed. It is however allowed to hold a single + * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at + * the same time. This is relied upon by some filesystems during online + * repair or similar. + */ +enum freeze_holder { + FREEZE_HOLDER_KERNEL = (1U << 0), + FREEZE_HOLDER_USERSPACE = (1U << 1), + FREEZE_MAY_NEST = (1U << 2), + FREEZE_EXCL = (1U << 3), +}; + +struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *inode); + void (*free_inode)(struct inode *inode); + void (*dirty_inode)(struct inode *inode, int flags); + int (*write_inode)(struct inode *inode, struct writeback_control *wbc); + int (*drop_inode)(struct inode *inode); + void (*evict_inode)(struct inode *inode); + void (*put_super)(struct super_block *sb); + int (*sync_fs)(struct super_block *sb, int wait); + int (*freeze_super)(struct super_block *sb, enum freeze_holder who, + const void *owner); + int (*freeze_fs)(struct super_block *sb); + int (*thaw_super)(struct super_block *sb, enum freeze_holder who, + const void *owner); + int (*unfreeze_fs)(struct super_block *sb); + int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs); + int (*remount_fs) (struct super_block *, int *, char *); + void (*umount_begin)(struct super_block *sb); + + int (*show_options)(struct seq_file *seq, struct dentry *dentry); + int (*show_devname)(struct seq_file *seq, struct dentry *dentry); + int (*show_path)(struct seq_file *seq, struct dentry *dentry); + int (*show_stats)(struct seq_file *seq, struct dentry *dentry); +#ifdef CONFIG_QUOTA + ssize_t (*quota_read)(struct super_block *sb, int type, char *data, + size_t len, loff_t off); + ssize_t (*quota_write)(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + struct dquot __rcu **(*get_dquots)(struct inode *inode); +#endif + long (*nr_cached_objects)(struct super_block *sb, + struct shrink_control *sc); + long (*free_cached_objects)(struct super_block *sb, + struct shrink_control *sc); + /* + * If a filesystem can support graceful removal of a device and + * continue read-write operations, implement this callback. + * + * Return 0 if the filesystem can continue read-write. + * Non-zero return value or no such callback means the fs will be shutdown + * as usual. + */ + int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); + void (*shutdown)(struct super_block *sb); +}; + +struct super_block { + struct list_head s_list; /* Keep this first */ + dev_t s_dev; /* search index; _not_ kdev_t */ + unsigned char s_blocksize_bits; + unsigned long s_blocksize; + loff_t s_maxbytes; /* Max file size */ + struct file_system_type *s_type; + const struct super_operations *s_op; + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; + unsigned long s_flags; + unsigned long s_iflags; /* internal SB_I_* flags */ + unsigned long s_magic; + struct dentry *s_root; + struct rw_semaphore s_umount; + int s_count; + atomic_t s_active; +#ifdef CONFIG_SECURITY + void *s_security; +#endif + const struct xattr_handler *const *s_xattr; +#ifdef CONFIG_FS_ENCRYPTION + const struct fscrypt_operations *s_cop; + struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ +#endif +#ifdef CONFIG_FS_VERITY + const struct fsverity_operations *s_vop; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif + struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ + struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ + struct file *s_bdev_file; + struct backing_dev_info *s_bdi; + struct mtd_info *s_mtd; + struct hlist_node s_instances; + unsigned int s_quota_types; /* Bitmask of supported quota types */ + struct quota_info s_dquot; /* Diskquota specific options */ + + struct sb_writers s_writers; + + /* + * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and + * s_fsnotify_info together for cache efficiency. They are frequently + * accessed and rarely modified. + */ + void *s_fs_info; /* Filesystem private info */ + + /* Granularity of c/m/atime in ns (cannot be worse than a second) */ + u32 s_time_gran; + /* Time limits for c/m/atime in seconds */ + time64_t s_time_min; + time64_t s_time_max; +#ifdef CONFIG_FSNOTIFY + u32 s_fsnotify_mask; + struct fsnotify_sb_info *s_fsnotify_info; +#endif + + /* + * q: why are s_id and s_sysfs_name not the same? both are human + * readable strings that identify the filesystem + * a: s_id is allowed to change at runtime; it's used in log messages, + * and we want to when a device starts out as single device (s_id is dev + * name) but then a device is hot added and we have to switch to + * identifying it by UUID + * but s_sysfs_name is a handle for programmatic access, and can't + * change at runtime + */ + char s_id[32]; /* Informational name */ + uuid_t s_uuid; /* UUID */ + u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ + + /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ + char s_sysfs_name[UUID_STRING_LEN + 1]; + + unsigned int s_max_links; + unsigned int s_d_flags; /* default d_flags for dentries */ + + /* + * The next field is for VFS *only*. No filesystems have any business + * even looking at it. You had been warned. + */ + struct mutex s_vfs_rename_mutex; /* Kludge */ + + /* + * Filesystem subtype. If non-empty the filesystem type field + * in /proc/mounts will be "type.subtype" + */ + const char *s_subtype; + + const struct dentry_operations *__s_d_op; /* default d_op for dentries */ + + struct shrinker *s_shrink; /* per-sb shrinker handle */ + + /* Number of inodes with nlink == 0 but still referenced */ + atomic_long_t s_remove_count; + + /* Read-only state of the superblock is being changed */ + int s_readonly_remount; + + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + + /* AIO completions deferred from interrupt context */ + struct workqueue_struct *s_dio_done_wq; + struct hlist_head s_pins; + + /* + * Owning user namespace and default context in which to + * interpret filesystem uids, gids, quotas, device nodes, + * xattrs and security labels. + */ + struct user_namespace *s_user_ns; + + /* + * The list_lru structure is essentially just a pointer to a table + * of per-node lru lists, each of which has its own spinlock. + * There is no need to put them into separate cachelines. + */ + struct list_lru s_dentry_lru; + struct list_lru s_inode_lru; + struct rcu_head rcu; + struct work_struct destroy_work; + + struct mutex s_sync_lock; /* sync serialisation lock */ + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; + + /* s_inode_list_lock protects s_inodes */ + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; + struct list_head s_inodes; /* all inodes */ + + spinlock_t s_inode_wblist_lock; + struct list_head s_inodes_wb; /* writeback inodes */ + long s_min_writeback_pages; +} __randomize_layout; + +/* + * sb->s_flags. Note that these mirror the equivalent MS_* flags where + * represented in both. + */ +#define SB_RDONLY BIT(0) /* Mount read-only */ +#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ +#define SB_NODEV BIT(2) /* Disallow access to device special files */ +#define SB_NOEXEC BIT(3) /* Disallow program execution */ +#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ +#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ +#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ +#define SB_NOATIME BIT(10) /* Do not update access times. */ +#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ +#define SB_SILENT BIT(15) +#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ +#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ +#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ +#define SB_I_VERSION BIT(23) /* Update inode I_version field */ +#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ + +/* These sb flags are internal to the kernel */ +#define SB_DEAD BIT(21) +#define SB_DYING BIT(24) +#define SB_FORCE BIT(27) +#define SB_NOSEC BIT(28) +#define SB_BORN BIT(29) +#define SB_ACTIVE BIT(30) +#define SB_NOUSER BIT(31) + +/* These flags relate to encoding and casefolding */ +#define SB_ENC_STRICT_MODE_FL (1 << 0) +#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) + +#define sb_has_strict_encoding(sb) \ + (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) + +#if IS_ENABLED(CONFIG_UNICODE) +#define sb_no_casefold_compat_fallback(sb) \ + (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) +#else +#define sb_no_casefold_compat_fallback(sb) (1) +#endif + +/* sb->s_iflags */ +#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ +#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ +#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ +#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ + +/* sb->s_iflags to limit user namespace mounts */ +#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 +#define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 + +#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ +#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ +#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ +#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ + +#endif /* _LINUX_FS_SUPER_TYPES_H */ diff --git a/include/linux/fs_types.h b/include/linux/fs_dirent.h index 54816791196f..92f75c5bac19 100644 --- a/include/linux/fs_types.h +++ b/include/linux/fs_dirent.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_FS_TYPES_H -#define _LINUX_FS_TYPES_H +#ifndef _LINUX_FS_DIRENT_H +#define _LINUX_FS_DIRENT_H + +#include <linux/stat.h> +#include <linux/types.h> /* * This is a header for the common implementation of dirent @@ -66,10 +69,10 @@ /* * declarations for helper functions, accompanying implementation - * is in fs/fs_types.c + * is in fs/fs_dirent.c */ extern unsigned char fs_ftype_to_dtype(unsigned int filetype); extern unsigned char fs_umode_to_ftype(umode_t mode); extern unsigned char fs_umode_to_dtype(umode_t mode); -#endif +#endif /* _LINUX_FS_DIRENT_H */ diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index baf200ab5c77..0070764b790a 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_STRUCT_H #define _LINUX_FS_STRUCT_H +#include <linux/sched.h> #include <linux/path.h> #include <linux/spinlock.h> #include <linux/seqlock.h> @@ -41,4 +42,9 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) extern bool current_chrooted(void); +static inline int current_umask(void) +{ + return current->fs->umask; +} + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 07f8c309e432..015dd1049bea 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -359,6 +359,7 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), FTRACE_OPS_FL_GRAPH = BIT(19), + FTRACE_OPS_FL_JMP = BIT(20), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS @@ -577,6 +578,38 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { } #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP +static inline bool ftrace_is_jmp(unsigned long addr) +{ + return addr & 1; +} + +static inline unsigned long ftrace_jmp_set(unsigned long addr) +{ + return addr | 1UL; +} + +static inline unsigned long ftrace_jmp_get(unsigned long addr) +{ + return addr & ~1UL; +} +#else +static inline bool ftrace_is_jmp(unsigned long addr) +{ + return false; +} + +static inline unsigned long ftrace_jmp_set(unsigned long addr) +{ + return addr; +} + +static inline unsigned long ftrace_jmp_get(unsigned long addr) +{ + return addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_JMP */ + #ifdef CONFIG_STACK_TRACER int stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 65db9349f905..3de43b12209e 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -55,9 +55,7 @@ enum { #ifdef CONFIG_LOCKDEP ___GFP_NOLOCKDEP_BIT, #endif -#ifdef CONFIG_SLAB_OBJ_EXT ___GFP_NO_OBJ_EXT_BIT, -#endif ___GFP_LAST_BIT }; @@ -98,11 +96,7 @@ enum { #else #define ___GFP_NOLOCKDEP 0 #endif -#ifdef CONFIG_SLAB_OBJ_EXT #define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) -#else -#define ___GFP_NO_OBJ_EXT 0 -#endif /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h index 8838ca2f3d08..dadb5e0aa8a3 100644 --- a/include/linux/hfs_common.h +++ b/include/linux/hfs_common.h @@ -17,4 +17,637 @@ pr_debug("pid %d:%s:%d %s(): " fmt, \ current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__) \ +/* + * Format of structures on disk + * Information taken from Apple Technote #1150 (HFS Plus Volume Format) + */ + +/* offsets to various blocks */ +#define HFS_DD_BLK 0 /* Driver Descriptor block */ +#define HFS_PMAP_BLK 1 /* First block of partition map */ +#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ + +/* magic numbers for various disk blocks */ +#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ +#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ +#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ +#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ +#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ + +#define HFSPLUS_VOLHEAD_SIG 0x482b +#define HFSPLUS_VOLHEAD_SIGX 0x4858 +#define HFSPLUS_SUPER_MAGIC 0x482b + +#define HFSP_WRAP_MAGIC 0x4244 +#define HFSP_WRAP_ATTRIB_SLOCK 0x8000 +#define HFSP_WRAP_ATTRIB_SPARED 0x0200 + +#define HFSP_WRAPOFF_SIG 0x00 +#define HFSP_WRAPOFF_ATTRIB 0x0A +#define HFSP_WRAPOFF_ABLKSIZE 0x14 +#define HFSP_WRAPOFF_ABLKSTART 0x1C +#define HFSP_WRAPOFF_EMBEDSIG 0x7C +#define HFSP_WRAPOFF_EMBEDEXT 0x7E + +#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ +#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ + +#define HFSP_SYMLINK_TYPE 0x736c6e6b /* 'slnk' */ +#define HFSP_SYMLINK_CREATOR 0x72686170 /* 'rhap' */ + +#define HFSP_MOUNT_VERSION 0x482b4c78 /* 'H+Lx' */ + +#define HFSP_HIDDENDIR_NAME \ + "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" + +/* various FIXED size parameters */ +#define HFS_SECTOR_SIZE 512 /* size of an HFS sector */ +#define HFS_SECTOR_SIZE_BITS 9 /* log_2(HFS_SECTOR_SIZE) */ +#define HFS_MAX_VALENCE 32767U + +#define HFSPLUS_SECTOR_SIZE HFS_SECTOR_SIZE +#define HFSPLUS_SECTOR_SHIFT HFS_SECTOR_SIZE_BITS +#define HFSPLUS_VOLHEAD_SECTOR 2 +#define HFSPLUS_MIN_VERSION 4 +#define HFSPLUS_CURRENT_VERSION 5 + +#define HFS_NAMELEN 31 /* maximum length of an HFS filename */ +#define HFS_MAX_NAMELEN 128 + +#define HFSPLUS_MAX_STRLEN 255 +#define HFSPLUS_ATTR_MAX_STRLEN 127 + +/* Meanings of the drAtrb field of the MDB, + * Reference: _Inside Macintosh: Files_ p. 2-61 + */ +#define HFS_SB_ATTRIB_HLOCK (1 << 7) +#define HFS_SB_ATTRIB_UNMNT (1 << 8) +#define HFS_SB_ATTRIB_SPARED (1 << 9) +#define HFS_SB_ATTRIB_INCNSTNT (1 << 11) +#define HFS_SB_ATTRIB_SLOCK (1 << 15) + +/* values for hfs_cat_rec.cdrType */ +#define HFS_CDR_DIR 0x01 /* folder (directory) */ +#define HFS_CDR_FIL 0x02 /* file */ +#define HFS_CDR_THD 0x03 /* folder (directory) thread */ +#define HFS_CDR_FTH 0x04 /* file thread */ + +/* legal values for hfs_ext_key.FkType and hfs_file.fork */ +#define HFS_FK_DATA 0x00 +#define HFS_FK_RSRC 0xFF + +/* bits in hfs_fil_entry.Flags */ +#define HFS_FIL_LOCK 0x01 /* locked */ +#define HFS_FIL_THD 0x02 /* file thread */ +#define HFS_FIL_DOPEN 0x04 /* data fork open */ +#define HFS_FIL_ROPEN 0x08 /* resource fork open */ +#define HFS_FIL_DIR 0x10 /* directory (always clear) */ +#define HFS_FIL_NOCOPY 0x40 /* copy-protected file */ +#define HFS_FIL_USED 0x80 /* open */ + +/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */ +#define HFS_DIR_LOCK 0x01 /* locked */ +#define HFS_DIR_THD 0x02 /* directory thread */ +#define HFS_DIR_INEXPFOLDER 0x04 /* in a shared area */ +#define HFS_DIR_MOUNTED 0x08 /* mounted */ +#define HFS_DIR_DIR 0x10 /* directory (always set) */ +#define HFS_DIR_EXPFOLDER 0x20 /* share point */ + +/* bits hfs_finfo.fdFlags */ +#define HFS_FLG_INITED 0x0100 +#define HFS_FLG_LOCKED 0x1000 +#define HFS_FLG_INVISIBLE 0x4000 + +/* Some special File ID numbers */ +#define HFS_POR_CNID 1 /* Parent Of the Root */ +#define HFSPLUS_POR_CNID HFS_POR_CNID +#define HFS_ROOT_CNID 2 /* ROOT directory */ +#define HFSPLUS_ROOT_CNID HFS_ROOT_CNID +#define HFS_EXT_CNID 3 /* EXTents B-tree */ +#define HFSPLUS_EXT_CNID HFS_EXT_CNID +#define HFS_CAT_CNID 4 /* CATalog B-tree */ +#define HFSPLUS_CAT_CNID HFS_CAT_CNID +#define HFS_BAD_CNID 5 /* BAD blocks file */ +#define HFSPLUS_BAD_CNID HFS_BAD_CNID +#define HFS_ALLOC_CNID 6 /* ALLOCation file (HFS+) */ +#define HFSPLUS_ALLOC_CNID HFS_ALLOC_CNID +#define HFS_START_CNID 7 /* STARTup file (HFS+) */ +#define HFSPLUS_START_CNID HFS_START_CNID +#define HFS_ATTR_CNID 8 /* ATTRibutes file (HFS+) */ +#define HFSPLUS_ATTR_CNID HFS_ATTR_CNID +#define HFS_EXCH_CNID 15 /* ExchangeFiles temp id */ +#define HFSPLUS_EXCH_CNID HFS_EXCH_CNID +#define HFS_FIRSTUSER_CNID 16 /* first available user id */ +#define HFSPLUS_FIRSTUSER_CNID HFS_FIRSTUSER_CNID + +/*======== HFS/HFS+ structures as they appear on the disk ========*/ + +typedef __be32 hfsplus_cnid; +typedef __be16 hfsplus_unichr; + +/* Pascal-style string of up to 31 characters */ +struct hfs_name { + u8 len; + u8 name[HFS_NAMELEN]; +} __packed; + +/* A "string" as used in filenames, etc. */ +struct hfsplus_unistr { + __be16 length; + hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN]; +} __packed; + +/* + * A "string" is used in attributes file + * for name of extended attribute + */ +struct hfsplus_attr_unistr { + __be16 length; + hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN]; +} __packed; + +struct hfs_extent { + __be16 block; + __be16 count; +}; +typedef struct hfs_extent hfs_extent_rec[3]; + +/* A single contiguous area of a file */ +struct hfsplus_extent { + __be32 start_block; + __be32 block_count; +} __packed; +typedef struct hfsplus_extent hfsplus_extent_rec[8]; + +/* Information for a "Fork" in a file */ +struct hfsplus_fork_raw { + __be64 total_size; + __be32 clump_size; + __be32 total_blocks; + hfsplus_extent_rec extents; +} __packed; + +struct hfs_mdb { + __be16 drSigWord; /* Signature word indicating fs type */ + __be32 drCrDate; /* fs creation date/time */ + __be32 drLsMod; /* fs modification date/time */ + __be16 drAtrb; /* fs attributes */ + __be16 drNmFls; /* number of files in root directory */ + __be16 drVBMSt; /* location (in 512-byte blocks) + of the volume bitmap */ + __be16 drAllocPtr; /* location (in allocation blocks) + to begin next allocation search */ + __be16 drNmAlBlks; /* number of allocation blocks */ + __be32 drAlBlkSiz; /* bytes in an allocation block */ + __be32 drClpSiz; /* clumpsize, the number of bytes to + allocate when extending a file */ + __be16 drAlBlSt; /* location (in 512-byte blocks) + of the first allocation block */ + __be32 drNxtCNID; /* CNID to assign to the next + file or directory created */ + __be16 drFreeBks; /* number of free allocation blocks */ + u8 drVN[28]; /* the volume label */ + __be32 drVolBkUp; /* fs backup date/time */ + __be16 drVSeqNum; /* backup sequence number */ + __be32 drWrCnt; /* fs write count */ + __be32 drXTClpSiz; /* clumpsize for the extents B-tree */ + __be32 drCTClpSiz; /* clumpsize for the catalog B-tree */ + __be16 drNmRtDirs; /* number of directories in + the root directory */ + __be32 drFilCnt; /* number of files in the fs */ + __be32 drDirCnt; /* number of directories in the fs */ + u8 drFndrInfo[32]; /* data used by the Finder */ + __be16 drEmbedSigWord; /* embedded volume signature */ + __be32 drEmbedExtent; /* starting block number (xdrStABN) + and number of allocation blocks + (xdrNumABlks) occupied by embedded + volume */ + __be32 drXTFlSize; /* bytes in the extents B-tree */ + hfs_extent_rec drXTExtRec; /* extents B-tree's first 3 extents */ + __be32 drCTFlSize; /* bytes in the catalog B-tree */ + hfs_extent_rec drCTExtRec; /* catalog B-tree's first 3 extents */ +} __packed; + +/* HFS+ Volume Header */ +struct hfsplus_vh { + __be16 signature; + __be16 version; + __be32 attributes; + __be32 last_mount_vers; + u32 reserved; + + __be32 create_date; + __be32 modify_date; + __be32 backup_date; + __be32 checked_date; + + __be32 file_count; + __be32 folder_count; + + __be32 blocksize; + __be32 total_blocks; + __be32 free_blocks; + + __be32 next_alloc; + __be32 rsrc_clump_sz; + __be32 data_clump_sz; + hfsplus_cnid next_cnid; + + __be32 write_count; + __be64 encodings_bmp; + + u32 finder_info[8]; + + struct hfsplus_fork_raw alloc_file; + struct hfsplus_fork_raw ext_file; + struct hfsplus_fork_raw cat_file; + struct hfsplus_fork_raw attr_file; + struct hfsplus_fork_raw start_file; +} __packed; + +/* HFS+ volume attributes */ +#define HFSPLUS_VOL_UNMNT (1 << 8) +#define HFSPLUS_VOL_SPARE_BLK (1 << 9) +#define HFSPLUS_VOL_NOCACHE (1 << 10) +#define HFSPLUS_VOL_INCNSTNT (1 << 11) +#define HFSPLUS_VOL_NODEID_REUSED (1 << 12) +#define HFSPLUS_VOL_JOURNALED (1 << 13) +#define HFSPLUS_VOL_SOFTLOCK (1 << 15) +#define HFSPLUS_VOL_UNUSED_NODE_FIX (1 << 31) + +struct hfs_point { + __be16 v; + __be16 h; +} __packed; + +typedef struct hfs_point hfsp_point; + +struct hfs_rect { + __be16 top; + __be16 left; + __be16 bottom; + __be16 right; +} __packed; + +typedef struct hfs_rect hfsp_rect; + +struct hfs_finfo { + __be32 fdType; + __be32 fdCreator; + __be16 fdFlags; + struct hfs_point fdLocation; + __be16 fdFldr; +} __packed; + +typedef struct hfs_finfo FInfo; + +struct hfs_fxinfo { + __be16 fdIconID; + u8 fdUnused[8]; + __be16 fdComment; + __be32 fdPutAway; +} __packed; + +typedef struct hfs_fxinfo FXInfo; + +struct hfs_dinfo { + struct hfs_rect frRect; + __be16 frFlags; + struct hfs_point frLocation; + __be16 frView; +} __packed; + +typedef struct hfs_dinfo DInfo; + +struct hfs_dxinfo { + struct hfs_point frScroll; + __be32 frOpenChain; + __be16 frUnused; + __be16 frComment; + __be32 frPutAway; +} __packed; + +typedef struct hfs_dxinfo DXInfo; + +union hfs_finder_info { + struct { + struct hfs_finfo finfo; + struct hfs_fxinfo fxinfo; + } file; + struct { + struct hfs_dinfo dinfo; + struct hfs_dxinfo dxinfo; + } dir; +} __packed; + +/* The key used in the catalog b-tree: */ +struct hfs_cat_key { + u8 key_len; /* number of bytes in the key */ + u8 reserved; /* padding */ + __be32 ParID; /* CNID of the parent dir */ + struct hfs_name CName; /* The filename of the entry */ +} __packed; + +/* HFS+ catalog entry key */ +struct hfsplus_cat_key { + __be16 key_len; + hfsplus_cnid parent; + struct hfsplus_unistr name; +} __packed; + +#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) + +/* The key used in the extents b-tree: */ +struct hfs_ext_key { + u8 key_len; /* number of bytes in the key */ + u8 FkType; /* HFS_FK_{DATA,RSRC} */ + __be32 FNum; /* The File ID of the file */ + __be16 FABN; /* allocation blocks number*/ +} __packed; + +/* HFS+ extents tree key */ +struct hfsplus_ext_key { + __be16 key_len; + u8 fork_type; + u8 pad; + hfsplus_cnid cnid; + __be32 start_block; +} __packed; + +#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) + +typedef union hfs_btree_key { + u8 key_len; /* number of bytes in the key */ + struct hfs_cat_key cat; + struct hfs_ext_key ext; +} hfs_btree_key; + +#define HFS_MAX_CAT_KEYLEN (sizeof(struct hfs_cat_key) - sizeof(u8)) +#define HFS_MAX_EXT_KEYLEN (sizeof(struct hfs_ext_key) - sizeof(u8)) + +typedef union hfs_btree_key btree_key; + +/* The catalog record for a file */ +struct hfs_cat_file { + s8 type; /* The type of entry */ + u8 reserved; + u8 Flags; /* Flags such as read-only */ + s8 Typ; /* file version number = 0 */ + struct hfs_finfo UsrWds; /* data used by the Finder */ + __be32 FlNum; /* The CNID */ + __be16 StBlk; /* obsolete */ + __be32 LgLen; /* The logical EOF of the data fork*/ + __be32 PyLen; /* The physical EOF of the data fork */ + __be16 RStBlk; /* obsolete */ + __be32 RLgLen; /* The logical EOF of the rsrc fork */ + __be32 RPyLen; /* The physical EOF of the rsrc fork */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modified date */ + __be32 BkDat; /* The last backup date */ + struct hfs_fxinfo FndrInfo; /* more data for the Finder */ + __be16 ClpSize; /* number of bytes to allocate + when extending files */ + hfs_extent_rec ExtRec; /* first extent record + for the data fork */ + hfs_extent_rec RExtRec; /* first extent record + for the resource fork */ + u32 Resrv; /* reserved by Apple */ +} __packed; + +/* the catalog record for a directory */ +struct hfs_cat_dir { + s8 type; /* The type of entry */ + u8 reserved; + __be16 Flags; /* flags */ + __be16 Val; /* Valence: number of files and + dirs in the directory */ + __be32 DirID; /* The CNID */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modification date */ + __be32 BkDat; /* The last backup date */ + struct hfs_dinfo UsrInfo; /* data used by the Finder */ + struct hfs_dxinfo FndrInfo; /* more data used by Finder */ + u8 Resrv[16]; /* reserved by Apple */ +} __packed; + +/* the catalog record for a thread */ +struct hfs_cat_thread { + s8 type; /* The type of entry */ + u8 reserved[9]; /* reserved by Apple */ + __be32 ParID; /* CNID of parent directory */ + struct hfs_name CName; /* The name of this entry */ +} __packed; + +/* A catalog tree record */ +typedef union hfs_cat_rec { + s8 type; /* The type of entry */ + struct hfs_cat_file file; + struct hfs_cat_dir dir; + struct hfs_cat_thread thread; +} hfs_cat_rec; + +/* POSIX permissions */ +struct hfsplus_perm { + __be32 owner; + __be32 group; + u8 rootflags; + u8 userflags; + __be16 mode; + __be32 dev; +} __packed; + +#define HFSPLUS_FLG_NODUMP 0x01 +#define HFSPLUS_FLG_IMMUTABLE 0x02 +#define HFSPLUS_FLG_APPEND 0x04 + +/* HFS/HFS+ BTree node descriptor */ +struct hfs_bnode_desc { + __be32 next; /* (V) Number of the next node at this level */ + __be32 prev; /* (V) Number of the prev node at this level */ + u8 type; /* (F) The type of node */ + u8 height; /* (F) The level of this node (leaves=1) */ + __be16 num_recs; /* (V) The number of records in this node */ + u16 reserved; +} __packed; + +/* HFS/HFS+ BTree node types */ +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ + +/* HFS/HFS+ BTree header */ +struct hfs_btree_header_rec { + __be16 depth; /* (V) The number of levels in this B-tree */ + __be32 root; /* (V) The node number of the root node */ + __be32 leaf_count; /* (V) The number of leaf records */ + __be32 leaf_head; /* (V) The number of the first leaf node */ + __be32 leaf_tail; /* (V) The number of the last leaf node */ + __be16 node_size; /* (F) The number of bytes in a node (=512) */ + __be16 max_key_len; /* (F) The length of a key in an index node */ + __be32 node_count; /* (V) The total number of nodes */ + __be32 free_nodes; /* (V) The number of unused nodes */ + u16 reserved1; + __be32 clump_size; /* (F) clump size. not usually used. */ + u8 btree_type; /* (F) BTree type */ + u8 key_type; + __be32 attributes; /* (F) attributes */ + u32 reserved3[16]; +} __packed; + +/* BTree attributes */ +#define BTREE_ATTR_BADCLOSE 0x00000001 /* b-tree not closed properly. not + used by hfsplus. */ +#define HFS_TREE_BIGKEYS 0x00000002 /* key length is u16 instead of u8. + used by hfsplus. */ +#define HFS_TREE_VARIDXKEYS 0x00000004 /* variable key length instead of + max key length. use din catalog + b-tree but not in extents + b-tree (hfsplus). */ + +/* HFS+ BTree misc info */ +#define HFSPLUS_TREE_HEAD 0 +#define HFSPLUS_NODE_MXSZ 32768 +#define HFSPLUS_ATTR_TREE_NODE_SIZE 8192 +#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3 +#define HFSPLUS_BTREE_HDR_USER_BYTES 128 + +/* btree key type */ +#define HFSPLUS_KEY_CASEFOLDING 0xCF /* case-insensitive */ +#define HFSPLUS_KEY_BINARY 0xBC /* case-sensitive */ + +/* HFS+ folder data (part of an hfsplus_cat_entry) */ +struct hfsplus_cat_folder { + __be16 type; + __be16 flags; + __be32 valence; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct_group_attr(info, __packed, + DInfo user_info; + DXInfo finder_info; + ); + __be32 text_encoding; + __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */ +} __packed; + +/* HFS+ file data (part of a cat_entry) */ +struct hfsplus_cat_file { + __be16 type; + __be16 flags; + u32 reserved1; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct_group_attr(info, __packed, + FInfo user_info; + FXInfo finder_info; + ); + __be32 text_encoding; + u32 reserved2; + + struct hfsplus_fork_raw data_fork; + struct hfsplus_fork_raw rsrc_fork; +} __packed; + +/* File and folder flag bits */ +#define HFSPLUS_FILE_LOCKED 0x0001 +#define HFSPLUS_FILE_THREAD_EXISTS 0x0002 +#define HFSPLUS_XATTR_EXISTS 0x0004 +#define HFSPLUS_ACL_EXISTS 0x0008 +#define HFSPLUS_HAS_FOLDER_COUNT 0x0010 /* Folder has subfolder count + * (HFSX only) */ + +/* HFS+ catalog thread (part of a cat_entry) */ +struct hfsplus_cat_thread { + __be16 type; + s16 reserved; + hfsplus_cnid parentID; + struct hfsplus_unistr nodeName; +} __packed; + +#define HFSPLUS_MIN_THREAD_SZ 10 + +/* A data record in the catalog tree */ +typedef union { + __be16 type; + struct hfsplus_cat_folder folder; + struct hfsplus_cat_file file; + struct hfsplus_cat_thread thread; +} __packed hfsplus_cat_entry; + +/* HFS+ catalog entry type */ +#define HFSPLUS_FOLDER 0x0001 +#define HFSPLUS_FILE 0x0002 +#define HFSPLUS_FOLDER_THREAD 0x0003 +#define HFSPLUS_FILE_THREAD 0x0004 + +#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo" +#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security" + +#define HFSPLUS_ATTR_INLINE_DATA 0x10 +#define HFSPLUS_ATTR_FORK_DATA 0x20 +#define HFSPLUS_ATTR_EXTENTS 0x30 + +/* HFS+ attributes tree key */ +struct hfsplus_attr_key { + __be16 key_len; + __be16 pad; + hfsplus_cnid cnid; + __be32 start_block; + struct hfsplus_attr_unistr key_name; +} __packed; + +#define HFSPLUS_ATTR_KEYLEN sizeof(struct hfsplus_attr_key) + +/* HFS+ fork data attribute */ +struct hfsplus_attr_fork_data { + __be32 record_type; + __be32 reserved; + struct hfsplus_fork_raw the_fork; +} __packed; + +/* HFS+ extension attribute */ +struct hfsplus_attr_extents { + __be32 record_type; + __be32 reserved; + struct hfsplus_extent extents; +} __packed; + +#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802 + +/* HFS+ attribute inline data */ +struct hfsplus_attr_inline_data { + __be32 record_type; + __be32 reserved1; + u8 reserved2[6]; + __be16 length; + u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE]; +} __packed; + +/* A data record in the attributes tree */ +typedef union { + __be32 record_type; + struct hfsplus_attr_fork_data fork_data; + struct hfsplus_attr_extents extents; + struct hfsplus_attr_inline_data inline_data; +} __packed hfsplus_attr_entry; + +/* HFS+ generic BTree key */ +typedef union { + __be16 key_len; + struct hfsplus_cat_key cat; + struct hfsplus_ext_key ext; + struct hfsplus_attr_key attr; +} __packed hfsplus_btree_key; + #endif /* _HFS_COMMON_H_ */ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 71ac78b9f834..11cab07f322a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -void huge_pmd_set_accessed(struct vm_fault *vmf); +bool huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index 5eb66a399002..4f33e6a39797 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -174,5 +174,6 @@ int iio_dma_buffer_enqueue_dmabuf(struct iio_buffer *buffer, size_t size, bool cyclic); void iio_dma_buffer_lock_queue(struct iio_buffer *buffer); void iio_dma_buffer_unlock_queue(struct iio_buffer *buffer); +struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer); #endif diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index e72552e026f3..8d770ced66b2 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -50,6 +50,7 @@ struct sg_table; * @enqueue_dmabuf: called from userspace via ioctl to queue this DMABUF * object to this buffer. Requires a valid DMABUF fd, that * was previouly attached to this buffer. + * @get_dma_dev: called to get the DMA channel associated with this buffer. * @lock_queue: called when the core needs to lock the buffer queue; * it is used when enqueueing DMABUF objects. * @unlock_queue: used to unlock a previously locked buffer queue @@ -90,6 +91,7 @@ struct iio_buffer_access_funcs { struct iio_dma_buffer_block *block, struct dma_fence *fence, struct sg_table *sgt, size_t size, bool cyclic); + struct device * (*get_dma_dev)(struct iio_buffer *buffer); void (*lock_queue)(struct iio_buffer *buffer); void (*unlock_queue)(struct iio_buffer *buffer); diff --git a/include/linux/init.h b/include/linux/init.h index 17c1bc712e23..40331923b9f4 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -200,12 +200,13 @@ extern struct module __this_module; /* Format: <modname>__<counter>_<line>_<fn> */ #define __initcall_id(fn) \ + __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(__COUNTER__, \ __PASTE(_, \ __PASTE(__LINE__, \ - __PASTE(_, fn)))))) + __PASTE(_, fn))))))) /* Format: __<prefix>__<iid><id> */ #define __initcall_name(prefix, __iid, id) \ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index bccb3f1f6262..a6cb241ea00c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,7 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; extern struct nsproxy init_nsproxy; -extern struct cred init_cred; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #define INIT_PREV_CPUTIME(x) .prev_cputime = { \ diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index c0397423d3a8..e9ade2ff4af6 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -152,7 +152,7 @@ struct rapl_if_priv { union rapl_reg reg_unit; union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; - int (*read_raw)(int id, struct reg_action *ra); + int (*read_raw)(int id, struct reg_action *ra, bool atomic); int (*write_raw)(int id, struct reg_action *ra); void *defaults; void *rpi; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 51b6484c0493..266f2b39213a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -109,6 +109,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device + * @affinity: CPUs this irqaction is allowed to run on * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) @@ -121,8 +122,11 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); */ struct irqaction { irq_handler_t handler; - void *dev_id; - void __percpu *percpu_dev_id; + union { + void *dev_id; + void __percpu *percpu_dev_id; + }; + const struct cpumask *affinity; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; @@ -179,7 +183,7 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *percpu_dev_id); + const cpumask_t *affinity, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, @@ -190,12 +194,21 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, - devname, percpu_dev_id); + devname, NULL, percpu_dev_id); +} + +static inline int __must_check +request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler, + const char *devname, const cpumask_t *affinity, + void __percpu *percpu_dev_id) +{ + return __request_percpu_irq(irq, handler, 0, + devname, affinity, percpu_dev_id); } extern int __must_check -request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev); +request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 2b8026a39906..9d5791e9f737 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -20,6 +20,10 @@ interval_tree_remove(struct interval_tree_node *node, struct rb_root_cached *root); extern struct interval_tree_node * +interval_tree_subtree_search(struct interval_tree_node *node, + unsigned long start, unsigned long last); + +extern struct interval_tree_node * interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 1b400f26f63d..c5a2fed49eb0 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ * Cond2: start <= ITLAST(node) \ */ \ \ -static ITSTRUCT * \ +ITSTATIC ITSTRUCT * \ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ { \ while (true) { \ diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 7509025b4071..375fd048c4cb 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -11,17 +11,13 @@ /* io_uring_cmd is being issued again */ #define IORING_URING_CMD_REISSUE (1U << 31) -typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, - unsigned issue_flags); - struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; - /* callback to defer completions to task context */ - io_uring_cmd_tw_t task_work_cb; u32 cmd_op; u32 flags; u8 pdu[32]; /* available inline for free use */ + u8 unused[8]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) @@ -60,7 +56,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, unsigned issue_flags, bool is_cqe32); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, + io_req_tw_func_t task_work_cb, unsigned flags); /* @@ -109,7 +105,7 @@ static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, unsigned flags) + io_req_tw_func_t task_work_cb, unsigned flags) { } static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, @@ -132,15 +128,23 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, } #endif +static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) +{ + return io_kiocb_to_cmd(tw_req.req, struct io_uring_cmd); +} + +/* task_work executor checks the deferred list completion */ +#define IO_URING_CMD_TASK_WORK_ISSUE_FLAGS IO_URING_F_COMPLETE_DEFER + /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb) + io_req_tw_func_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); } static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb) + io_req_tw_func_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); } diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c2ea6280901d..e1adb0d20a0a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -39,7 +39,6 @@ enum io_uring_cmd_flags { /* set when uring wants to cancel a previously issued command */ IO_URING_F_CANCEL = (1 << 11), IO_URING_F_COMPAT = (1 << 12), - IO_URING_F_TASK_DEAD = (1 << 13), }; struct io_wq_work_node { @@ -328,8 +327,8 @@ struct io_ring_ctx { /* * Modifications are protected by ->uring_lock and ->mmap_lock. - * The flags, buf_pages and buf_nr_pages fields should be stable - * once published. + * The buffer list's io mapped region should be stable once + * published. */ struct xarray io_bl_xa; @@ -474,6 +473,7 @@ struct io_ring_ctx { * ONLY core io_uring.c should instantiate this struct. */ struct io_tw_state { + bool cancel; }; /* Alias to use in code that doesn't instantiate struct io_tw_state */ typedef struct io_tw_state io_tw_token_t; @@ -614,7 +614,11 @@ enum { REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); +struct io_tw_req { + struct io_kiocb *req; +}; + +typedef void (*io_req_tw_func_t)(struct io_tw_req tw_req, io_tw_token_t tw); struct io_task_work { struct llist_node node; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 73dceabc21c8..520e967cb501 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/mm_types.h> #include <linux/blkdev.h> +#include <linux/pagevec.h> struct address_space; struct fiemap_extent_info; @@ -16,6 +17,7 @@ struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; +struct iomap_read_folio_ctx; struct iov_iter; struct kiocb; struct page; @@ -241,11 +243,12 @@ struct iomap_iter { unsigned flags; struct iomap iomap; struct iomap srcmap; + struct folio_batch *fbatch; void *private; }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); -int iomap_iter_advance(struct iomap_iter *iter, u64 *count); +int iomap_iter_advance(struct iomap_iter *iter, u64 count); /** * iomap_length_trim - trimmed length of the current iomap iteration @@ -282,9 +285,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter) */ static inline int iomap_iter_advance_full(struct iomap_iter *iter) { - u64 length = iomap_length(iter); - - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, iomap_length(iter)); } /** @@ -339,8 +340,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); -void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); +void iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -349,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops); +loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, + loff_t length); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); @@ -430,6 +435,10 @@ struct iomap_writeback_ops { * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. * + * If this succeeds, iomap_finish_folio_write() must be called once + * writeback completes for the range, regardless of whether the + * writeback succeeded or failed. + * * Returns the number of bytes processed or a negative errno. */ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, @@ -467,14 +476,41 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, loff_t pos, loff_t end_pos, unsigned int dirty_len); int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len); +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len); int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio); int iomap_writepages(struct iomap_writepage_ctx *wpc); +struct iomap_read_folio_ctx { + const struct iomap_read_ops *ops; + struct folio *cur_folio; + struct readahead_control *rac; + void *read_ctx; +}; + +struct iomap_read_ops { + /* + * Read in a folio range. + * + * If this succeeds, iomap_finish_folio_read() must be called after the + * range is read in, regardless of whether the read succeeded or failed. + * + * Returns 0 on success or a negative error on failure. + */ + int (*read_folio_range)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t len); + + /* + * Submit any pending read requests. + * + * This is optional. + */ + void (*submit_read)(struct iomap_read_folio_ctx *ctx); +}; + /* * Flags for direct I/O ->end_io: */ @@ -518,6 +554,14 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Ensure each bio is aligned to fs block size. + * + * For filesystems which need to calculate/verify the checksum of each fs + * block. Otherwise they may not be able to handle unaligned bios. + */ +#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); @@ -540,4 +584,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; +#ifdef CONFIG_BLOCK +extern const struct iomap_read_ops iomap_bio_read_ops; + +static inline void iomap_bio_read_folio(struct folio *folio, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .cur_folio = folio, + }; + + iomap_read_folio(ops, &ctx); +} + +static inline void iomap_bio_readahead(struct readahead_control *rac, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .rac = rac, + }; + + iomap_readahead(ops, &ctx); +} +#endif /* CONFIG_BLOCK */ + #endif /* LINUX_IOMAP_H */ diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d643c7c87822..6ab913e57da0 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -2,11 +2,12 @@ #ifndef __LINUX_IRQENTRYCOMMON_H #define __LINUX_IRQENTRYCOMMON_H +#include <linux/context_tracking.h> +#include <linux/kmsan.h> +#include <linux/rseq_entry.h> #include <linux/static_call_types.h> #include <linux/syscalls.h> -#include <linux/context_tracking.h> #include <linux/tick.h> -#include <linux/kmsan.h> #include <linux/unwind_deferred.h> #include <asm/entry-common.h> @@ -29,7 +30,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) /** @@ -67,6 +68,7 @@ static __always_inline bool arch_in_rcu_eqs(void) { return false; } /** * enter_from_user_mode - Establish state when coming from user mode + * @regs: Pointer to currents pt_regs * * Syscall/interrupt entry disables interrupts, but user mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. @@ -195,14 +197,11 @@ static __always_inline void arch_exit_to_user_mode(void) { } */ void arch_do_signal_or_restart(struct pt_regs *regs); -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - */ -unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work); +/* Handle pending TIF work */ +unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); /** - * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack * * 1) check that interrupts are disabled @@ -210,8 +209,10 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, * 3) call exit_to_user_mode_loop() if any flags from * EXIT_TO_USER_MODE_WORK are set * 4) check that interrupts are still disabled + * + * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long ti_work; @@ -225,13 +226,52 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) ti_work = exit_to_user_mode_loop(regs, ti_work); arch_exit_to_user_mode_prepare(regs, ti_work); +} +static __always_inline void __exit_to_user_mode_validate(void) +{ /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } +/* Temporary workaround to keep ARM64 alive */ +static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode_legacy(); + __exit_to_user_mode_validate(); +} + +/** + * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. + */ +static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_syscall_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + +/** + * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. + */ +static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_irqentry_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -253,11 +293,11 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) static __always_inline void exit_to_user_mode(void) { instrumentation_begin(); + unwind_reset_info(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); - unwind_reset_info(); user_enter_irqoff(); arch_exit_to_user_mode(); lockdep_hardirqs_on(CALLER_ADDR0); @@ -274,7 +314,11 @@ static __always_inline void exit_to_user_mode(void) * * The function establishes state (lockdep, RCU (context tracking), tracing) */ -void irqentry_enter_from_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + rseq_note_user_irq_entry(); +} /** * irqentry_exit_to_user_mode - Interrupt exit work @@ -289,7 +333,13 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); * Interrupt exit is not invoking #1 which is the syscall specific one time * work. */ -void irqentry_exit_to_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + irqentry_exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} #ifndef irqentry_state /** @@ -354,6 +404,7 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); * Conditional reschedule with additional sanity checks. */ void raw_irqentry_exit_cond_resched(void); + #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched diff --git a/include/linux/irq.h b/include/linux/irq.h index c67e76fbcc07..4a9f1d7b08c3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -655,7 +655,6 @@ extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern void handle_fasteoi_nmi(struct irq_desc *desc); -extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); @@ -719,10 +718,6 @@ static inline void irq_set_chip_and_handler(unsigned int irq, } extern int irq_set_percpu_devid(unsigned int irq); -extern int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity); -extern int irq_get_percpu_devid_partition(unsigned int irq, - struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 136f2980cba3..c5afd053ae32 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -2,8 +2,9 @@ #ifndef _LINUX_IRQ_WORK_H #define _LINUX_IRQ_WORK_H -#include <linux/smp_types.h> +#include <linux/irq_work_types.h> #include <linux/rcuwait.h> +#include <linux/smp_types.h> /* * An entry can be in one of four states: @@ -14,12 +15,6 @@ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed */ -struct irq_work { - struct __call_single_node node; - void (*func)(struct irq_work *); - struct rcuwait irqwait; -}; - #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ diff --git a/include/linux/irq_work_types.h b/include/linux/irq_work_types.h new file mode 100644 index 000000000000..73abec5bb06e --- /dev/null +++ b/include/linux/irq_work_types.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IRQ_WORK_TYPES_H +#define _LINUX_IRQ_WORK_TYPES_H + +#include <linux/smp_types.h> +#include <linux/types.h> + +struct irq_work { + struct __call_single_node node; + void (*func)(struct irq_work *); + struct rcuwait irqwait; +}; + +#endif diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index d5e6024cb2a8..bc4ddacd6ddc 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -17,12 +17,18 @@ #include <linux/of_irq.h> #include <linux/platform_device.h> +typedef int (*platform_irq_probe_t)(struct platform_device *, struct device_node *); + /* Undefined on purpose */ extern of_irq_init_cb_t typecheck_irq_init_cb; +extern platform_irq_probe_t typecheck_irq_probe; #define typecheck_irq_init_cb(fn) \ (__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn) +#define typecheck_irq_probe(fn) \ + (__typecheck(typecheck_irq_probe, &fn) ? fn : fn) + /* * This macro must be used by the different irqchip drivers to declare * the association between their DT compatible string and their @@ -42,7 +48,7 @@ extern int platform_irqchip_probe(struct platform_device *pdev); static const struct of_device_id drv_name##_irqchip_match_table[] = { #define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \ - .data = typecheck_irq_init_cb(fn), }, + .data = typecheck_irq_probe(fn), }, #define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) \ diff --git a/include/linux/irqchip/irq-partition-percpu.h b/include/linux/irqchip/irq-partition-percpu.h deleted file mode 100644 index b35ee22c278f..000000000000 --- a/include/linux/irqchip/irq-partition-percpu.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2016 ARM Limited, All Rights Reserved. - * Author: Marc Zyngier <marc.zyngier@arm.com> - */ - -#ifndef __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H -#define __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H - -#include <linux/fwnode.h> -#include <linux/cpumask_types.h> -#include <linux/irqdomain.h> - -struct partition_affinity { - cpumask_t mask; - void *partition_id; -}; - -struct partition_desc; - -#ifdef CONFIG_PARTITION_PERCPU -int partition_translate_id(struct partition_desc *desc, void *partition_id); -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops); -struct irq_domain *partition_get_domain(struct partition_desc *dsc); -#else -static inline int partition_translate_id(struct partition_desc *desc, - void *partition_id) -{ - return -EINVAL; -} - -static inline -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops) -{ - return NULL; -} - -static inline -struct irq_domain *partition_get_domain(struct partition_desc *dsc) -{ - return NULL; -} -#endif - -#endif /* __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index fd091c35d572..37e0b5b5600a 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -82,7 +82,6 @@ struct irq_desc { int threads_handled_last; raw_spinlock_t lock; struct cpumask *percpu_enabled; - const struct cpumask *percpu_affinity; #ifdef CONFIG_SMP const struct cpumask *affinity_hint; struct irq_affinity_notify *affinity_notify; diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 4a86e6b915dd..952d3c8dd6b7 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -44,6 +44,23 @@ struct irq_fwspec { u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; }; +/** + * struct irq_fwspec_info - firmware provided IRQ information structure + * + * @flags: Information validity flags + * @affinity: Affinity mask for this interrupt + * + * This structure reports firmware-specific information about an + * interrupt. The only significant information is the affinity of a + * per-CPU interrupt, but this is designed to be extended as required. + */ +struct irq_fwspec_info { + unsigned long flags; + const struct cpumask *affinity; +}; + +#define IRQ_FWSPEC_INFO_AFFINITY_VALID BIT(0) + /* Conversion function from of_phandle_args fields to fwspec */ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, unsigned int count, struct irq_fwspec *fwspec); @@ -69,6 +86,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, * @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and * linux irq type value (@out_type). This is a generalised @xlate * (over struct irq_fwspec) and is preferred if provided. + * @get_fwspec_info: + * Given @fwspec, report additional firmware-provided information in + * @info. Optional. * @debug_show: For domains to show specific data for an interrupt in debugfs. * * Functions below are provided by the driver and called whenever a new mapping @@ -96,6 +116,7 @@ struct irq_domain_ops { void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); + int (*get_fwspec_info)(struct irq_fwspec *fwspec, struct irq_fwspec_info *info); #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS void (*debug_show)(struct seq_file *m, struct irq_domain *d, @@ -602,6 +623,8 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_bas int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq); +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info); + static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; @@ -685,6 +708,10 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain) return false; } +static inline int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + return -EINVAL; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_MSI_IRQ @@ -703,12 +730,6 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig } #endif -/* Deprecated functions. Will be removed in the merge window */ -static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) -{ - return node ? &node->fwnode : NULL; -} - static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 43b9297fe8a7..f5eaf76198f3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1253,6 +1253,12 @@ struct journal_s */ struct lockdep_map j_trans_commit_map; #endif + /** + * @jbd2_trans_commit_key: + * + * "struct lock_class_key" for @j_trans_commit_map + */ + struct lock_class_key jbd2_trans_commit_key; /** * @j_fc_cleanup_callback: diff --git a/include/linux/kdb.h b/include/linux/kdb.h index ecbf819deeca..741c58e86431 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -14,6 +14,7 @@ */ #include <linux/list.h> +#include <linux/smp.h> /* Shifted versions of the command enable bits are be used if the command * has no arguments (see kdb_check_flags). This allows commands, such as @@ -207,11 +208,26 @@ static inline const char *kdb_walk_kallsyms(loff_t *pos) /* Dynamic kdb shell command registration */ extern int kdb_register(kdbtab_t *cmd); extern void kdb_unregister(kdbtab_t *cmd); + +/* Return true when KDB as locked for printing a message on this CPU. */ +static inline +bool kdb_printf_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because the task could + * not get migrated when KDB has locked for printing on this CPU. + */ + return unlikely(READ_ONCE(kdb_printf_cpu) == raw_smp_processor_id()); +} + #else /* ! CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; } static inline void kdb_init(int level) {} static inline int kdb_register(kdbtab_t *cmd) { return 0; } static inline void kdb_unregister(kdbtab_t *cmd) {} + +static inline bool kdb_printf_on_this_cpu(void) { return false; } + #endif /* CONFIG_KGDB_KDB */ enum { KDB_NOT_INITIALIZED, diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h index 90451e2e12bd..d613a7b4dd35 100644 --- a/include/linux/kernel_read_file.h +++ b/include/linux/kernel_read_file.h @@ -14,6 +14,7 @@ id(KEXEC_INITRAMFS, kexec-initramfs) \ id(POLICY, security-policy) \ id(X509_CERTIFICATE, x509-certificate) \ + id(MODULE_COMPRESSED, kernel-module-compressed) \ id(MAX_ID, ) #define __fid_enumify(ENUM, dummy) READING_ ## ENUM, diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 5caf3ce82373..bb97bd3e5af4 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -107,11 +107,14 @@ struct key_type { */ int (*match_preparse)(struct key_match_data *match_data); - /* Free preparsed match data (optional). This should be supplied it - * ->match_preparse() is supplied. */ + /* + * Free preparsed match data (optional). This should be supplied if + * ->match_preparse() is supplied. + */ void (*match_free)(struct key_match_data *match_data); - /* clear some of the data from a key on revokation (optional) + /* + * Clear some of the data from a key on revocation (optional). * - the key's semaphore will be write-locked by the caller */ void (*revoke)(struct key *key); diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index fd743d4c4b4b..8b81ac74829c 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -370,6 +370,30 @@ __kfifo_int_must_check_helper( \ ) /** + * kfifo_alloc_node - dynamically allocates a new fifo buffer on a NUMA node + * @fifo: pointer to the fifo + * @size: the number of elements in the fifo, this must be a power of 2 + * @gfp_mask: get_free_pages mask, passed to kmalloc() + * @node: NUMA node to allocate memory on + * + * This macro dynamically allocates a new fifo buffer with NUMA node awareness. + * + * The number of elements will be rounded-up to a power of 2. + * The fifo will be release with kfifo_free(). + * Return 0 if no error, otherwise an error code. + */ +#define kfifo_alloc_node(fifo, size, gfp_mask, node) \ +__kfifo_int_must_check_helper( \ +({ \ + typeof((fifo) + 1) __tmp = (fifo); \ + struct __kfifo *__kfifo = &__tmp->kfifo; \ + __is_kfifo_ptr(__tmp) ? \ + __kfifo_alloc_node(__kfifo, size, sizeof(*__tmp->type), gfp_mask, node) : \ + -EINVAL; \ +}) \ +) + +/** * kfifo_free - frees the fifo * @fifo: the fifo to be freed */ @@ -899,8 +923,14 @@ __kfifo_uint_must_check_helper( \ ) -extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask); +extern int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask, int node); + +static inline int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) +{ + return __kfifo_alloc_node(fifo, size, esize, gfp_mask, NUMA_NO_NODE); +} extern void __kfifo_free(struct __kfifo *fifo); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 490464c205b4..a568d8e6f4e8 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -11,8 +11,22 @@ #ifdef KVM_SUB_MODULES #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) \ EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES)) +#define EXPORT_SYMBOL_FOR_KVM(symbol) \ + EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm," __stringify(KVM_SUB_MODULES)) #else #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) +/* + * Allow architectures to provide a custom EXPORT_SYMBOL_FOR_KVM, but only + * if there are no submodules, e.g. to allow suppressing exports if KVM=m, but + * kvm.ko won't actually be built (due to lack of at least one submodule). + */ +#ifndef EXPORT_SYMBOL_FOR_KVM +#if IS_MODULE(CONFIG_KVM) +#define EXPORT_SYMBOL_FOR_KVM(symbol) EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm") +#else +#define EXPORT_SYMBOL_FOR_KVM(symbol) +#endif /* IS_MODULE(CONFIG_KVM) */ +#endif /* EXPORT_SYMBOL_FOR_KVM */ #endif #ifndef __ASSEMBLER__ diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 51a258c24ff5..772919e8096a 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -13,6 +13,7 @@ #include <linux/ftrace.h> #include <linux/completion.h> #include <linux/list.h> +#include <linux/livepatch_external.h> #include <linux/livepatch_sched.h> #if IS_ENABLED(CONFIG_LIVEPATCH) @@ -77,30 +78,6 @@ struct klp_func { bool transition; }; -struct klp_object; - -/** - * struct klp_callbacks - pre/post live-(un)patch callback structure - * @pre_patch: executed before code patching - * @post_patch: executed after code patching - * @pre_unpatch: executed before code unpatching - * @post_unpatch: executed after code unpatching - * @post_unpatch_enabled: flag indicating if post-unpatch callback - * should run - * - * All callbacks are optional. Only the pre-patch callback, if provided, - * will be unconditionally executed. If the parent klp_object fails to - * patch for any reason, including a non-zero error status returned from - * the pre-patch callback, no further callbacks will be executed. - */ -struct klp_callbacks { - int (*pre_patch)(struct klp_object *obj); - void (*post_patch)(struct klp_object *obj); - void (*pre_unpatch)(struct klp_object *obj); - void (*post_unpatch)(struct klp_object *obj); - bool post_unpatch_enabled; -}; - /** * struct klp_object - kernel object structure for live patching * @name: module name (or NULL for vmlinux) diff --git a/include/linux/livepatch_external.h b/include/linux/livepatch_external.h new file mode 100644 index 000000000000..138af19b0f5c --- /dev/null +++ b/include/linux/livepatch_external.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * External livepatch interfaces for patch creation tooling + */ + +#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_ +#define _LINUX_LIVEPATCH_EXTERNAL_H_ + +#include <linux/types.h> + +#define KLP_RELOC_SEC_PREFIX ".klp.rela." +#define KLP_SYM_PREFIX ".klp.sym." + +#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_ +#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_ +#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_ +#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_ + +#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX) +#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX) +#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX) +#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX) + +struct klp_object; + +typedef int (*klp_pre_patch_t)(struct klp_object *obj); +typedef void (*klp_post_patch_t)(struct klp_object *obj); +typedef void (*klp_pre_unpatch_t)(struct klp_object *obj); +typedef void (*klp_post_unpatch_t)(struct klp_object *obj); + +/** + * struct klp_callbacks - pre/post live-(un)patch callback structure + * @pre_patch: executed before code patching + * @post_patch: executed after code patching + * @pre_unpatch: executed before code unpatching + * @post_unpatch: executed after code unpatching + * @post_unpatch_enabled: flag indicating if post-unpatch callback + * should run + * + * All callbacks are optional. Only the pre-patch callback, if provided, + * will be unconditionally executed. If the parent klp_object fails to + * patch for any reason, including a non-zero error status returned from + * the pre-patch callback, no further callbacks will be executed. + */ +struct klp_callbacks { + klp_pre_patch_t pre_patch; + klp_post_patch_t post_patch; + klp_pre_unpatch_t pre_unpatch; + klp_post_unpatch_t post_unpatch; + bool post_unpatch_enabled; +}; + +/* + * 'struct klp_{func,object}_ext' are compact "external" representations of + * 'struct klp_{func,object}'. They are used by objtool for livepatch + * generation. The structs are then read by the livepatch module and converted + * to the real structs before calling klp_enable_patch(). + * + * TODO make these the official API for klp_enable_patch(). That should + * simplify livepatch's interface as well as its data structure lifetime + * management. + */ +struct klp_func_ext { + const char *old_name; + void *new_func; + unsigned long sympos; +}; + +struct klp_object_ext { + const char *name; + struct klp_func_ext *funcs; + struct klp_callbacks callbacks; + unsigned int nr_funcs; +}; + +#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */ diff --git a/include/linux/livepatch_helpers.h b/include/linux/livepatch_helpers.h new file mode 100644 index 000000000000..99d68d0773fa --- /dev/null +++ b/include/linux/livepatch_helpers.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LIVEPATCH_HELPERS_H +#define _LINUX_LIVEPATCH_HELPERS_H + +/* + * Interfaces for use by livepatch patches + */ + +#include <linux/syscalls.h> +#include <linux/livepatch.h> + +#ifdef MODULE +#define KLP_OBJNAME __KBUILD_MODNAME +#else +#define KLP_OBJNAME vmlinux +#endif + +/* Livepatch callback registration */ + +#define KLP_CALLBACK_PTRS ".discard.klp_callback_ptrs" + +#define KLP_PRE_PATCH_CALLBACK(func) \ + klp_pre_patch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_PRE_PATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_POST_PATCH_CALLBACK(func) \ + klp_post_patch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_POST_PATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_PRE_UNPATCH_CALLBACK(func) \ + klp_pre_unpatch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_PRE_UNPATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_POST_UNPATCH_CALLBACK(func) \ + klp_post_unpatch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_POST_UNPATCH_PREFIX, KLP_OBJNAME) = func + +/* + * Replace static_call() usage with this macro when create-diff-object + * recommends it due to the original static call key living in a module. + * + * This converts the static call to a regular indirect call. + */ +#define KLP_STATIC_CALL(name) \ + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) + +/* Syscall patching */ + +#define KLP_SYSCALL_DEFINE1(name, ...) KLP_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE2(name, ...) KLP_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE3(name, ...) KLP_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE4(name, ...) KLP_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE5(name, ...) KLP_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE6(name, ...) KLP_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) + +#define KLP_SYSCALL_DEFINEx(x, sname, ...) \ + __KLP_SYSCALL_DEFINEx(x, sname, __VA_ARGS__) + +#ifdef CONFIG_X86_64 +// TODO move this to arch/x86/include/asm/syscall_wrapper.h and share code +#define __KLP_SYSCALL_DEFINEx(x, name, ...) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + __X64_SYS_STUBx(x, name, __VA_ARGS__) \ + __IA32_SYS_STUBx(x, name, __VA_ARGS__) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = __klp_do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +#endif + +#endif /* _LINUX_LIVEPATCH_HELPERS_H */ diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 0d91d060e3e9..b0e6ab329b00 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -6,6 +6,7 @@ /** * local_lock_init - Runtime initialize a lock instance + * @lock: The lock variable */ #define local_lock_init(lock) __local_lock_init(lock) @@ -52,7 +53,8 @@ __local_unlock_irqrestore(this_cpu_ptr(lock), flags) /** - * local_lock_init - Runtime initialize a lock instance + * local_trylock_init - Runtime initialize a lock instance + * @lock: The lock variable */ #define local_trylock_init(lock) __local_trylock_init(lock) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index a4dc479157b5..8f82b4eb542f 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -99,18 +99,18 @@ do { \ #define __local_lock_acquire(lock) \ do { \ - local_trylock_t *tl; \ - local_lock_t *l; \ + local_trylock_t *__tl; \ + local_lock_t *__l; \ \ - l = (local_lock_t *)(lock); \ - tl = (local_trylock_t *)l; \ + __l = (local_lock_t *)(lock); \ + __tl = (local_trylock_t *)__l; \ _Generic((lock), \ local_trylock_t *: ({ \ - lockdep_assert(tl->acquired == 0); \ - WRITE_ONCE(tl->acquired, 1); \ + lockdep_assert(__tl->acquired == 0); \ + WRITE_ONCE(__tl->acquired, 1); \ }), \ local_lock_t *: (void)0); \ - local_lock_acquire(l); \ + local_lock_acquire(__l); \ } while (0) #define __local_lock(lock) \ @@ -133,36 +133,36 @@ do { \ #define __local_trylock(lock) \ ({ \ - local_trylock_t *tl; \ + local_trylock_t *__tl; \ \ preempt_disable(); \ - tl = (lock); \ - if (READ_ONCE(tl->acquired)) { \ + __tl = (lock); \ + if (READ_ONCE(__tl->acquired)) { \ preempt_enable(); \ - tl = NULL; \ + __tl = NULL; \ } else { \ - WRITE_ONCE(tl->acquired, 1); \ + WRITE_ONCE(__tl->acquired, 1); \ local_trylock_acquire( \ - (local_lock_t *)tl); \ + (local_lock_t *)__tl); \ } \ - !!tl; \ + !!__tl; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ - local_trylock_t *tl; \ + local_trylock_t *__tl; \ \ local_irq_save(flags); \ - tl = (lock); \ - if (READ_ONCE(tl->acquired)) { \ + __tl = (lock); \ + if (READ_ONCE(__tl->acquired)) { \ local_irq_restore(flags); \ - tl = NULL; \ + __tl = NULL; \ } else { \ - WRITE_ONCE(tl->acquired, 1); \ + WRITE_ONCE(__tl->acquired, 1); \ local_trylock_acquire( \ - (local_lock_t *)tl); \ + (local_lock_t *)__tl); \ } \ - !!tl; \ + !!__tl; \ }) /* preemption or migration must be disabled before calling __local_lock_is_locked */ @@ -170,16 +170,16 @@ do { \ #define __local_lock_release(lock) \ do { \ - local_trylock_t *tl; \ - local_lock_t *l; \ + local_trylock_t *__tl; \ + local_lock_t *__l; \ \ - l = (local_lock_t *)(lock); \ - tl = (local_trylock_t *)l; \ - local_lock_release(l); \ + __l = (local_lock_t *)(lock); \ + __tl = (local_trylock_t *)__l; \ + local_lock_release(__l); \ _Generic((lock), \ local_trylock_t *: ({ \ - lockdep_assert(tl->acquired == 1); \ - WRITE_ONCE(tl->acquired, 0); \ + lockdep_assert(__tl->acquired == 1); \ + WRITE_ONCE(__tl->acquired, 0); \ }), \ local_lock_t *: (void)0); \ } while (0) @@ -223,12 +223,12 @@ typedef spinlock_t local_trylock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) -#define __local_lock_init(l) \ +#define __local_lock_init(__l) \ do { \ - local_spin_lock_init((l)); \ + local_spin_lock_init((__l)); \ } while (0) -#define __local_trylock_init(l) __local_lock_init(l) +#define __local_trylock_init(__l) __local_lock_init(__l) #define __local_lock(__lock) \ do { \ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 67964dc4db95..dd634103b014 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -616,7 +616,7 @@ do { \ #define lockdep_assert_in_softirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && \ - (!in_softirq() || in_irq() || in_nmi())); \ + (!in_softirq() || in_hardirq() || in_nmi())); \ } while (0) extern void lockdep_assert_in_softirq_func(void); diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 79ec5a2bdcca..b92008641242 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -102,23 +102,23 @@ struct security_hook_list { * Security blob size or offset data. */ struct lsm_blob_sizes { - int lbs_cred; - int lbs_file; - int lbs_ib; - int lbs_inode; - int lbs_sock; - int lbs_superblock; - int lbs_ipc; - int lbs_key; - int lbs_msg_msg; - int lbs_perf_event; - int lbs_task; - int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ - int lbs_tun_dev; - int lbs_bdev; - int lbs_bpf_map; - int lbs_bpf_prog; - int lbs_bpf_token; + unsigned int lbs_cred; + unsigned int lbs_file; + unsigned int lbs_ib; + unsigned int lbs_inode; + unsigned int lbs_sock; + unsigned int lbs_superblock; + unsigned int lbs_ipc; + unsigned int lbs_key; + unsigned int lbs_msg_msg; + unsigned int lbs_perf_event; + unsigned int lbs_task; + unsigned int lbs_xattr_count; /* num xattr slots in new_xattrs array */ + unsigned int lbs_tun_dev; + unsigned int lbs_bdev; + unsigned int lbs_bpf_map; + unsigned int lbs_bpf_prog; + unsigned int lbs_bpf_token; }; /* @@ -151,13 +151,36 @@ enum lsm_order { LSM_ORDER_LAST = 1, /* This is only for integrity. */ }; +/** + * struct lsm_info - Define an individual LSM for the LSM framework. + * @id: LSM name/ID info + * @order: ordering with respect to other LSMs, optional + * @flags: descriptive flags, optional + * @blobs: LSM blob sharing, optional + * @enabled: controlled by CONFIG_LSM, optional + * @init: LSM specific initialization routine + * @initcall_pure: LSM callback for initcall_pure() setup, optional + * @initcall_early: LSM callback for early_initcall setup, optional + * @initcall_core: LSM callback for core_initcall() setup, optional + * @initcall_subsys: LSM callback for subsys_initcall() setup, optional + * @initcall_fs: LSM callback for fs_initcall setup, optional + * @nitcall_device: LSM callback for device_initcall() setup, optional + * @initcall_late: LSM callback for late_initcall() setup, optional + */ struct lsm_info { - const char *name; /* Required. */ - enum lsm_order order; /* Optional: default is LSM_ORDER_MUTABLE */ - unsigned long flags; /* Optional: flags describing LSM */ - int *enabled; /* Optional: controlled by CONFIG_LSM */ - int (*init)(void); /* Required. */ - struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */ + const struct lsm_id *id; + enum lsm_order order; + unsigned long flags; + struct lsm_blob_sizes *blobs; + int *enabled; + int (*init)(void); + int (*initcall_pure)(void); + int (*initcall_early)(void); + int (*initcall_core)(void); + int (*initcall_subsys)(void); + int (*initcall_fs)(void); + int (*initcall_device)(void); + int (*initcall_late)(void); }; #define DEFINE_LSM(lsm) \ @@ -170,11 +193,9 @@ struct lsm_info { __used __section(".early_lsm_info.init") \ __aligned(sizeof(unsigned long)) + /* DO NOT tamper with these variables outside of the LSM framework */ -extern char *lsm_names; extern struct lsm_static_calls_table static_calls_table __ro_after_init; -extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; -extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; /** * lsm_get_xattr_slot - Return the next available slot and increment the index diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h index 4c1a91b07de3..e1555e06e7e5 100644 --- a/include/linux/mailbox/mtk-cmdq-mailbox.h +++ b/include/linux/mailbox/mtk-cmdq-mailbox.h @@ -77,6 +77,16 @@ struct cmdq_pkt { size_t buf_size; /* real buffer size */ }; +/** + * cmdq_get_shift_pa() - get the shift bits of physical address + * @chan: mailbox channel + * + * GCE can only fetch the command buffer address from a 32-bit register. + * Some SOCs support more than 32-bit command buffer address for GCE, which + * requires some shift bits to make the address fit into the 32-bit register. + * + * Return: the shift bits of physical address + */ u8 cmdq_get_shift_pa(struct mbox_chan *chan); #endif /* __MTK_CMDQ_MAILBOX_H__ */ diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 6f606d9573c3..cc74de3dbcfe 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -4,6 +4,8 @@ #include <linux/file.h> +#define MEMFD_ANON_NAME "[memfd]" + #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..ba1515160894 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,17 +96,8 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { - /* - * The altmap_start_pfn and altmap_nr_pages fields are designated for - * specifying the altmap range and are exclusively intended for use in - * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. - */ - unsigned long altmap_start_pfn; - unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; }; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 23f038a16231..f2f16cdd73ee 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -58,22 +58,6 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) -/* - * The hotplugged memory is completely inaccessible while the memory is - * offline. The memory provider will handle MEM_PREPARE_ONLINE / - * MEM_FINISH_OFFLINE notifications and make the memory accessible. - * - * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, - * because the altmap cannot be written (e.g., poisoned) when adding - * memory -- before it is set online. - * - * This allows for adding memory with an altmap that is not currently - * made available by a hypervisor. When onlining that memory, the - * hypervisor can be instructed to make that memory available, and - * the onlining phase will not require any memory allocations, which is - * helpful in low-memory situations. - */ -#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -123,7 +107,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible); + struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 34941a4b9026..e8e440e04a06 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -27,32 +27,31 @@ typedef struct mempool { wait_queue_head_t wait; } mempool_t; -static inline bool mempool_initialized(mempool_t *pool) +static inline bool mempool_initialized(struct mempool *pool) { return pool->elements != NULL; } -static inline bool mempool_is_saturated(mempool_t *pool) +static inline bool mempool_is_saturated(struct mempool *pool) { return READ_ONCE(pool->curr_nr) >= pool->min_nr; } -void mempool_exit(mempool_t *pool); -int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id); - -int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); +void mempool_exit(struct mempool *pool); +int mempool_init_node(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id); +int mempool_init_noprof(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data); #define mempool_init(...) \ alloc_hooks(mempool_init_noprof(__VA_ARGS__)) -extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); - -extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int nid); +struct mempool *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +struct mempool *mempool_create_node_noprof(int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int nid); #define mempool_create_node(...) \ alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) @@ -60,15 +59,21 @@ extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_ mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ GFP_KERNEL, NUMA_NO_NODE) -extern int mempool_resize(mempool_t *pool, int new_min_nr); -extern void mempool_destroy(mempool_t *pool); +int mempool_resize(struct mempool *pool, int new_min_nr); +void mempool_destroy(struct mempool *pool); -extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; +void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc; #define mempool_alloc(...) \ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) +int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, + unsigned int count, unsigned int allocated); +#define mempool_alloc_bulk(...) \ + alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) -extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; -extern void mempool_free(void *element, mempool_t *pool); +void *mempool_alloc_preallocated(struct mempool *pool) __malloc; +void mempool_free(void *element, struct mempool *pool); +unsigned int mempool_free_bulk(struct mempool *pool, void **elem, + unsigned int count); /* * A mempool_alloc_t and mempool_free_t that get the memory from @@ -97,19 +102,6 @@ void mempool_kfree(void *element, void *pool_data); mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ (void *)(unsigned long)(_size)) -void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); -void mempool_kvfree(void *element, void *pool_data); - -static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); -} - -static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) -{ - return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); -} - /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5951ba12a28..30c7aecbd245 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,7 +25,6 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; - bool inaccessible; }; /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82..8dc0a07570cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2408,31 +2408,6 @@ struct zap_details { /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) -#ifdef CONFIG_SCHED_MM_CID -void sched_mm_cid_before_execve(struct task_struct *t); -void sched_mm_cid_after_execve(struct task_struct *t); -void sched_mm_cid_fork(struct task_struct *t); -void sched_mm_cid_exit_signals(struct task_struct *t); -static inline int task_mm_cid(struct task_struct *t) -{ - return t->mm_cid; -} -#else -static inline void sched_mm_cid_before_execve(struct task_struct *t) { } -static inline void sched_mm_cid_after_execve(struct task_struct *t) { } -static inline void sched_mm_cid_fork(struct task_struct *t) { } -static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } -static inline int task_mm_cid(struct task_struct *t) -{ - /* - * Use the processor id as a fall-back when the mm cid feature is - * disabled. This provides functional per-cpu data structure accesses - * in user-space, althrough it won't provide the memory usage benefits. - */ - return raw_smp_processor_id(); -} -#endif - #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else @@ -3376,6 +3351,8 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); +struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node, + unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, @@ -3502,10 +3479,10 @@ struct vm_unmapped_area_info { extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ -extern void truncate_inode_pages(struct address_space *, loff_t); -extern void truncate_inode_pages_range(struct address_space *, - loff_t lstart, loff_t lend); -extern void truncate_inode_pages_final(struct address_space *); +void truncate_inode_pages(struct address_space *mapping, loff_t lstart); +void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, + uoff_t lend); +void truncate_inode_pages_final(struct address_space *mapping); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..3b7d05e7169c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include <linux/seqlock.h> #include <linux/percpu_counter.h> #include <linux/types.h> +#include <linux/rseq_types.h> #include <linux/bitmap.h> #include <asm/mmu.h> @@ -922,14 +923,6 @@ struct vm_area_struct { #define vma_policy(vma) NULL #endif -#ifdef CONFIG_SCHED_MM_CID -struct mm_cid { - u64 time; - int cid; - int recent_cid; -}; -#endif - /* * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. @@ -991,44 +984,9 @@ struct mm_struct { */ atomic_t mm_users; -#ifdef CONFIG_SCHED_MM_CID - /** - * @pcpu_cid: Per-cpu current cid. - * - * Keep track of the currently allocated mm_cid for each cpu. - * The per-cpu mm_cid values are serialized by their respective - * runqueue locks. - */ - struct mm_cid __percpu *pcpu_cid; - /* - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). - * - * When the next mm_cid scan is due (in jiffies). - */ - unsigned long mm_cid_next_scan; - /** - * @nr_cpus_allowed: Number of CPUs allowed for mm. - * - * Number of CPUs allowed in the union of all mm's - * threads allowed CPUs. - */ - unsigned int nr_cpus_allowed; - /** - * @max_nr_cid: Maximum number of allowed concurrency - * IDs allocated. - * - * Track the highest number of allowed concurrency IDs - * allocated for the mm. - */ - atomic_t max_nr_cid; - /** - * @cpus_allowed_lock: Lock protecting mm cpus_allowed. - * - * Provide mutual exclusion for mm cpus_allowed and - * mm nr_cpus_allowed updates. - */ - raw_spinlock_t cpus_allowed_lock; -#endif + /* MM CID related storage */ + struct mm_mm_cid mm_cid; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1370,37 +1328,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi, } #ifdef CONFIG_SCHED_MM_CID - -enum mm_cid_state { - MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ - MM_CID_LAZY_PUT = (1U << 31), -}; - -static inline bool mm_cid_is_unset(int cid) -{ - return cid == MM_CID_UNSET; -} - -static inline bool mm_cid_is_lazy_put(int cid) -{ - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); -} - -static inline bool mm_cid_is_valid(int cid) -{ - return !(cid & MM_CID_LAZY_PUT); -} - -static inline int mm_cid_set_lazy_put(int cid) -{ - return cid | MM_CID_LAZY_PUT; -} - -static inline int mm_cid_clear_lazy_put(int cid) -{ - return cid & ~MM_CID_LAZY_PUT; -} - /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. */ @@ -1415,37 +1342,21 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) } /* Accessor for struct mm_struct's cidmask. */ -static inline cpumask_t *mm_cidmask(struct mm_struct *mm) +static inline unsigned long *mm_cidmask(struct mm_struct *mm) { unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); /* Skip mm_cpus_allowed */ cid_bitmap += cpumask_size(); - return (struct cpumask *)cid_bitmap; + return (unsigned long *)cid_bitmap; } -static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) -{ - int i; - - for_each_possible_cpu(i) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); - - pcpu_cid->cid = MM_CID_UNSET; - pcpu_cid->recent_cid = MM_CID_UNSET; - pcpu_cid->time = 0; - } - mm->nr_cpus_allowed = p->nr_cpus_allowed; - atomic_set(&mm->max_nr_cid, 0); - raw_spin_lock_init(&mm->cpus_allowed_lock); - cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); - cpumask_clear(mm_cidmask(mm)); -} +void mm_init_cid(struct mm_struct *mm, struct task_struct *p); static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) { - mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); - if (!mm->pcpu_cid) + mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu); + if (!mm->mm_cid.pcpu) return -ENOMEM; mm_init_cid(mm, p); return 0; @@ -1454,37 +1365,24 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct * static inline void mm_destroy_cid(struct mm_struct *mm) { - free_percpu(mm->pcpu_cid); - mm->pcpu_cid = NULL; + free_percpu(mm->mm_cid.pcpu); + mm->mm_cid.pcpu = NULL; } static inline unsigned int mm_cid_size(void) { - return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ + /* mm_cpus_allowed(), mm_cidmask(). */ + return cpumask_size() + bitmap_size(num_possible_cpus()); } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) -{ - struct cpumask *mm_allowed = mm_cpus_allowed(mm); - - if (!mm) - return; - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ - raw_spin_lock(&mm->cpus_allowed_lock); - cpumask_or(mm_allowed, mm_allowed, cpumask); - WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); - raw_spin_unlock(&mm->cpus_allowed_lock); -} #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } - static inline unsigned int mm_cid_size(void) { return 0; } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; diff --git a/include/linux/module.h b/include/linux/module.h index e135cc79acee..d80c3ea57472 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -251,10 +251,11 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); */ #define __mod_device_table(type, name) \ __PASTE(__mod_device_table__, \ + __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(type, \ - __PASTE(__, name))))) + __PASTE(__, name)))))) /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ diff --git a/include/linux/msi.h b/include/linux/msi.h index d415dd15a0a9..8003e3218c46 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -701,9 +701,6 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, - struct msi_domain_info *info, - struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 847b81ca6436..bf535f0118bb 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -86,8 +86,23 @@ do { \ #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) -extern void __mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_init_lockep(lock, name, key); +} +#else +extern void mutex_init_generic(struct mutex *lock); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_init_generic(lock); +} +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ /** * mutex_is_locked - is the mutex locked @@ -111,17 +126,27 @@ extern bool mutex_is_locked(struct mutex *lock); #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) -extern void __mutex_rt_init(struct mutex *lock, const char *name, - struct lock_class_key *key); - #define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) -#define __mutex_init(mutex, name, key) \ -do { \ - rt_mutex_base_init(&(mutex)->rtmutex); \ - __mutex_rt_init((mutex), name, key); \ -} while (0) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, + struct lock_class_key *key); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_rt_init_lockdep(lock, name, key); +} +#else +extern void mutex_rt_init_generic(struct mutex *mutex); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_rt_init_generic(lock); +} +#endif /* !CONFIG_LOCKDEP */ #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES diff --git a/include/linux/namei.h b/include/linux/namei.h index fed86221c69c..58600cf234bc 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -7,6 +7,7 @@ #include <linux/path.h> #include <linux/fcntl.h> #include <linux/errno.h> +#include <linux/fs_struct.h> enum { MAX_NESTED_LINKS = 8 }; @@ -88,6 +89,81 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); +struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name); +struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name); +struct dentry *start_creating_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name); +struct dentry *start_removing_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name); +struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_creating_dentry(struct dentry *parent, + struct dentry *child); +struct dentry *start_removing_dentry(struct dentry *parent, + struct dentry *child); + +/* end_creating - finish action started with start_creating + * @child: dentry returned by start_creating() or vfs_mkdir() + * + * Unlock and release the child. This can be called after + * start_creating() whether that function succeeded or not, + * but it is not needed on failure. + * + * If vfs_mkdir() was called then the value returned from that function + * should be given for @child rather than the original dentry, as vfs_mkdir() + * may have provided a new dentry. + * + * + * If vfs_mkdir() was not called, then @child will be a valid dentry and + * @parent will be ignored. + */ +static inline void end_creating(struct dentry *child) +{ + end_dirop(child); +} + +/* end_creating_keep - finish action started with start_creating() and return result + * @child: dentry returned by start_creating() or vfs_mkdir() + * + * Unlock and return the child. This can be called after + * start_creating() whether that function succeeded or not, + * but it is not needed on failure. + * + * If vfs_mkdir() was called then the value returned from that function + * should be given for @child rather than the original dentry, as vfs_mkdir() + * may have provided a new dentry. + * + * Returns: @child, which may be a dentry or an error. + * + */ +static inline struct dentry *end_creating_keep(struct dentry *child) +{ + if (!IS_ERR(child)) + dget(child); + end_dirop(child); + return child; +} + +/** + * end_removing - finish action started with start_removing + * @child: dentry returned by start_removing() + * @parent: dentry given to start_removing() + * + * Unlock and release the child. + * + * This is identical to end_dirop(). It can be passed the result of + * start_removing() whether that was successful or not, but it not needed + * if start_removing() failed. + */ +static inline void end_removing(struct dentry *child) +{ + end_dirop(child); +} + extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); @@ -95,6 +171,13 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); +int start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last); +int start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last); +int start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry); +void end_renaming(struct renamedata *rd); /** * mode_strip_umask - handle vfs umask stripping diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf99fe8622da..5870a9e514a5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3417,6 +3417,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp); +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, diff --git a/include/linux/notifier.h b/include/linux/notifier.h index b42e64734968..01b6c9d9956f 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -109,7 +109,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ .srcuu = __SRCU_USAGE_INIT(name.srcuu), \ - .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu, 0), \ } #define ATOMIC_NOTIFIER_HEAD(name) \ diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h new file mode 100644 index 000000000000..b332b019b29c --- /dev/null +++ b/include/linux/ns/ns_common_types.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NS_COMMON_TYPES_H +#define _LINUX_NS_COMMON_TYPES_H + +#include <linux/atomic.h> +#include <linux/ns/nstree_types.h> +#include <linux/rbtree.h> +#include <linux/refcount.h> +#include <linux/types.h> + +struct cgroup_namespace; +struct dentry; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct proc_ns_operations; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations utsns_operations; + +/* + * Namespace lifetimes are managed via a two-tier reference counting model: + * + * (1) __ns_ref (refcount_t): Main reference count tracking memory + * lifetime. Controls when the namespace structure itself is freed. + * It also pins the namespace on the namespace trees whereas (2) + * only regulates their visibility to userspace. + * + * (2) __ns_ref_active (atomic_t): Reference count tracking active users. + * Controls visibility of the namespace in the namespace trees. + * Any live task that uses the namespace (via nsproxy or cred) holds + * an active reference. Any open file descriptor or bind-mount of + * the namespace holds an active reference. Once all tasks have + * called exited their namespaces and all file descriptors and + * bind-mounts have been released the active reference count drops + * to zero and the namespace becomes inactive. IOW, the namespace + * cannot be listed or opened via file handles anymore. + * + * Note that it is valid to transition from active to inactive and + * back from inactive to active e.g., when resurrecting an inactive + * namespace tree via the SIOCGSKNS ioctl(). + * + * Relationship and lifecycle states: + * + * - Active (__ns_ref_active > 0): + * Namespace is actively used and visible to userspace. The namespace + * can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file + * handles, or discovered via listns(). + * + * - Inactive (__ns_ref_active == 0, __ns_ref > 0): + * No tasks are actively using the namespace and it isn't pinned by + * any bind-mounts or open file descriptors anymore. But the namespace + * is still kept alive by internal references. For example, the user + * namespace could be pinned by an open file through file->f_cred + * references when one of the now defunct tasks had opened a file and + * handed the file descriptor off to another process via a UNIX + * sockets. Such references keep the namespace structure alive through + * __ns_ref but will not hold an active reference. + * + * - Destroyed (__ns_ref == 0): + * No references remain. The namespace is removed from the tree and freed. + * + * State transitions: + * + * Active -> Inactive: + * When the last task using the namespace exits it drops its active + * references to all namespaces. However, user and pid namespaces + * remain accessible until the task has been reaped. + * + * Inactive -> Active: + * An inactive namespace tree might be resurrected due to e.g., the + * SIOCGSKNS ioctl() on a socket. + * + * Inactive -> Destroyed: + * When __ns_ref drops to zero the namespace is removed from the + * namespaces trees and the memory is freed (after RCU grace period). + * + * Initial namespaces: + * Boot-time namespaces (init_net, init_pid_ns, etc.) start with + * __ns_ref_active = 1 and remain active forever. + * + * @ns_type: type of namespace (e.g., CLONE_NEWNET) + * @stashed: cached dentry to be used by the vfs + * @ops: namespace operations + * @inum: namespace inode number (quickly recycled for non-initial namespaces) + * @__ns_ref: main reference count (do not use directly) + * @ns_tree: namespace tree nodes and active reference count + */ +struct ns_common { + u32 ns_type; + struct dentry *stashed; + const struct proc_ns_operations *ops; + unsigned int inum; + refcount_t __ns_ref; /* do not use directly */ + union { + struct ns_tree; + struct rcu_head ns_rcu; + }; +}; + +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) + +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_init_id(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ + struct ipc_namespace *: IPC_NS_INIT_ID, \ + struct mnt_namespace *: MNT_NS_INIT_ID, \ + struct net *: NET_NS_INIT_ID, \ + struct pid_namespace *: PID_NS_INIT_ID, \ + struct time_namespace *: TIME_NS_INIT_ID, \ + struct user_namespace *: USER_NS_INIT_ID, \ + struct uts_namespace *: UTS_NS_INIT_ID) + +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#endif /* _LINUX_NS_COMMON_TYPES_H */ diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h new file mode 100644 index 000000000000..2fb28ee31efb --- /dev/null +++ b/include/linux/ns/nstree_types.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ +#ifndef _LINUX_NSTREE_TYPES_H +#define _LINUX_NSTREE_TYPES_H + +#include <linux/rbtree.h> +#include <linux/list.h> + +/** + * struct ns_tree_root - Root of a namespace tree + * @ns_rb: Red-black tree root for efficient lookups + * @ns_list_head: List head for sequential iteration + * + * Each namespace tree maintains both an rbtree (for O(log n) lookups) + * and a list (for efficient sequential iteration). The list is kept in + * the same sorted order as the rbtree. + */ +struct ns_tree_root { + struct rb_root ns_rb; + struct list_head ns_list_head; +}; + +/** + * struct ns_tree_node - Node in a namespace tree + * @ns_node: Red-black tree node + * @ns_list_entry: List entry for sequential iteration + * + * Represents a namespace's position in a tree. Each namespace has + * multiple tree nodes for different trees (unified, per-type, owner). + */ +struct ns_tree_node { + struct rb_node ns_node; + struct list_head ns_list_entry; +}; + +/** + * struct ns_tree - Namespace tree nodes and active reference count + * @ns_id: Unique namespace identifier + * @__ns_ref_active: Active reference count (do not use directly) + * @ns_unified_node: Node in the global namespace tree + * @ns_tree_node: Node in the per-type namespace tree + * @ns_owner_node: Node in the owner namespace's tree of owned namespaces + * @ns_owner_root: Root of the tree of namespaces owned by this namespace + * (only used when this namespace is an owner) + */ +struct ns_tree { + u64 ns_id; + atomic_t __ns_ref_active; + struct ns_tree_node ns_unified_node; + struct ns_tree_node ns_tree_node; + struct ns_tree_node ns_owner_node; + struct ns_tree_root ns_owner_root; +}; + +#endif /* _LINUX_NSTREE_TYPES_H */ diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index f5b68b8abb54..825f5865bfc5 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -2,122 +2,44 @@ #ifndef _LINUX_NS_COMMON_H #define _LINUX_NS_COMMON_H +#include <linux/ns/ns_common_types.h> #include <linux/refcount.h> -#include <linux/rbtree.h> +#include <linux/vfsdebug.h> #include <uapi/linux/sched.h> +#include <uapi/linux/nsfs.h> -struct proc_ns_operations; - -struct cgroup_namespace; -struct ipc_namespace; -struct mnt_namespace; -struct net; -struct pid_namespace; -struct time_namespace; -struct user_namespace; -struct uts_namespace; - -extern struct cgroup_namespace init_cgroup_ns; -extern struct ipc_namespace init_ipc_ns; -extern struct mnt_namespace init_mnt_ns; -extern struct net init_net; -extern struct pid_namespace init_pid_ns; -extern struct time_namespace init_time_ns; -extern struct user_namespace init_user_ns; -extern struct uts_namespace init_uts_ns; - -extern const struct proc_ns_operations netns_operations; -extern const struct proc_ns_operations utsns_operations; -extern const struct proc_ns_operations ipcns_operations; -extern const struct proc_ns_operations pidns_operations; -extern const struct proc_ns_operations pidns_for_children_operations; -extern const struct proc_ns_operations userns_operations; -extern const struct proc_ns_operations mntns_operations; -extern const struct proc_ns_operations cgroupns_operations; -extern const struct proc_ns_operations timens_operations; -extern const struct proc_ns_operations timens_for_children_operations; - -struct ns_common { - u32 ns_type; - struct dentry *stashed; - const struct proc_ns_operations *ops; - unsigned int inum; - refcount_t __ns_ref; /* do not use directly */ - union { - struct { - u64 ns_id; - struct rb_node ns_tree_node; - struct list_head ns_list_node; - }; - struct rcu_head ns_rcu; - }; -}; - +bool is_current_namespace(struct ns_common *ns); int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); +struct ns_common *__must_check ns_owner(struct ns_common *ns); + +static __always_inline bool is_ns_init_inum(const struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->inum == 0); + return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, + IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); +} + +static __always_inline bool is_ns_init_id(const struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->ns_id == 0); + return ns->ns_id <= NS_LAST_INIT_ID; +} -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns)->ns, \ - const struct cgroup_namespace *: &(__ns)->ns, \ - struct ipc_namespace *: &(__ns)->ns, \ - const struct ipc_namespace *: &(__ns)->ns, \ - struct mnt_namespace *: &(__ns)->ns, \ - const struct mnt_namespace *: &(__ns)->ns, \ - struct net *: &(__ns)->ns, \ - const struct net *: &(__ns)->ns, \ - struct pid_namespace *: &(__ns)->ns, \ - const struct pid_namespace *: &(__ns)->ns, \ - struct time_namespace *: &(__ns)->ns, \ - const struct time_namespace *: &(__ns)->ns, \ - struct user_namespace *: &(__ns)->ns, \ - const struct user_namespace *: &(__ns)->ns, \ - struct uts_namespace *: &(__ns)->ns, \ - const struct uts_namespace *: &(__ns)->ns) - -#define ns_init_inum(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ - struct ipc_namespace *: IPC_NS_INIT_INO, \ - struct mnt_namespace *: MNT_NS_INIT_INO, \ - struct net *: NET_NS_INIT_INO, \ - struct pid_namespace *: PID_NS_INIT_INO, \ - struct time_namespace *: TIME_NS_INIT_INO, \ - struct user_namespace *: USER_NS_INIT_INO, \ - struct uts_namespace *: UTS_NS_INIT_INO) - -#define ns_init_ns(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &init_cgroup_ns, \ - struct ipc_namespace *: &init_ipc_ns, \ - struct mnt_namespace *: &init_mnt_ns, \ - struct net *: &init_net, \ - struct pid_namespace *: &init_pid_ns, \ - struct time_namespace *: &init_time_ns, \ - struct user_namespace *: &init_user_ns, \ - struct uts_namespace *: &init_uts_ns) - -#define to_ns_operations(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ - struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ - struct mnt_namespace *: &mntns_operations, \ - struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ - struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ - struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ - struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ - struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) - -#define ns_common_type(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CLONE_NEWCGROUP, \ - struct ipc_namespace *: CLONE_NEWIPC, \ - struct mnt_namespace *: CLONE_NEWNS, \ - struct net *: CLONE_NEWNET, \ - struct pid_namespace *: CLONE_NEWPID, \ - struct time_namespace *: CLONE_NEWTIME, \ - struct user_namespace *: CLONE_NEWUSER, \ - struct uts_namespace *: CLONE_NEWUTS) +#define NS_COMMON_INIT(nsname) \ +{ \ + .ns_type = ns_common_type(&nsname), \ + .ns_id = ns_init_id(&nsname), \ + .inum = ns_init_inum(&nsname), \ + .ops = to_ns_operations(&nsname), \ + .stashed = NULL, \ + .__ns_ref = REFCOUNT_INIT(1), \ + .__ns_ref_active = ATOMIC_INIT(1), \ + .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ + .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ + .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \ + .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \ +} #define ns_common_init(__ns) \ __ns_common_init(to_ns_common(__ns), \ @@ -133,21 +55,96 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) +{ + return atomic_read(&ns->__ns_ref_active); +} + +static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) +{ + return refcount_read(&ns->__ns_ref); +} + static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->__ns_ref); + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); + return false; + } + if (refcount_dec_and_test(&ns->__ns_ref)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return true; + } + return false; } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->__ns_ref); + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); + return true; + } + if (refcount_inc_not_zero(&ns->__ns_ref)) + return true; + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return false; } -#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) -#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) -#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) -#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) -#define ns_ref_put_and_lock(__ns, __lock) \ - refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) +static __always_inline void __ns_ref_inc(struct ns_common *ns) +{ + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); + return; + } + refcount_inc(&ns->__ns_ref); +} + +static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, + spinlock_t *ns_lock) +{ + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); + return false; + } + return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); +} + +#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) +#define ns_ref_inc(__ns) \ + do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0) +#define ns_ref_get(__ns) \ + ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false) +#define ns_ref_put(__ns) \ + ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false) +#define ns_ref_put_and_lock(__ns, __ns_lock) \ + ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false) + +#define ns_ref_active_read(__ns) \ + ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) + +void __ns_ref_active_put(struct ns_common *ns); + +#define ns_ref_active_put(__ns) \ + do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) + +static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) +{ + if (!__ns_ref_active_read(ns)) { + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + return NULL; + } + if (!__ns_ref_get(ns)) + return NULL; + return ns; +} + +void __ns_ref_active_get(struct ns_common *ns); + +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) #endif diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index e5a5fa83d36b..731b67fc2fec 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -37,4 +37,7 @@ void nsfs_init(void); #define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) +void nsproxy_ns_active_get(struct nsproxy *ns); +void nsproxy_ns_active_put(struct nsproxy *ns); + #endif /* _LINUX_NSFS_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index bd118a187dec..5a67648721c7 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -93,10 +93,13 @@ static inline struct cred *nsset_cred(struct nsset *set) */ int copy_namespaces(u64 flags, struct task_struct *tsk); -void exit_task_namespaces(struct task_struct *tsk); +void switch_cred_namespaces(const struct cred *old, const struct cred *new); +void exit_nsproxy_namespaces(struct task_struct *tsk); +void get_cred_namespaces(struct task_struct *tsk); +void exit_cred_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); -void free_nsproxy(struct nsproxy *ns); +void deactivate_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); @@ -104,7 +107,7 @@ int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) - free_nsproxy(ns); + deactivate_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 8b8636690473..175e4625bfa6 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -1,22 +1,34 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H -#include <linux/ns_common.h> +#include <linux/ns/nstree_types.h> #include <linux/nsproxy.h> #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/rculist.h> #include <linux/cookie.h> +#include <uapi/linux/nsfs.h> -extern struct ns_tree cgroup_ns_tree; -extern struct ns_tree ipc_ns_tree; -extern struct ns_tree mnt_ns_tree; -extern struct ns_tree net_ns_tree; -extern struct ns_tree pid_ns_tree; -extern struct ns_tree time_ns_tree; -extern struct ns_tree user_ns_tree; -extern struct ns_tree uts_ns_tree; +struct ns_common; + +extern struct ns_tree_root cgroup_ns_tree; +extern struct ns_tree_root ipc_ns_tree; +extern struct ns_tree_root mnt_ns_tree; +extern struct ns_tree_root net_ns_tree; +extern struct ns_tree_root pid_ns_tree; +extern struct ns_tree_root time_ns_tree; +extern struct ns_tree_root user_ns_tree; +extern struct ns_tree_root uts_ns_tree; + +void ns_tree_node_init(struct ns_tree_node *node); +void ns_tree_root_init(struct ns_tree_root *root); +bool ns_tree_node_empty(const struct ns_tree_node *node); +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)); +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root); #define to_ns_tree(__ns) \ _Generic((__ns), \ @@ -29,17 +41,21 @@ extern struct ns_tree uts_ns_tree; struct user_namespace *: &(user_ns_tree), \ struct uts_namespace *: &(uts_ns_tree)) -u64 ns_tree_gen_id(struct ns_common *ns); -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +#define ns_tree_gen_id(__ns) \ + __ns_tree_gen_id(to_ns_common(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) + +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree); struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, + struct ns_tree_root *ns_tree, bool previous); -static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id) { - ns_tree_gen_id(ns); + __ns_tree_gen_id(ns, id); __ns_tree_add_raw(ns, ns_tree); } @@ -59,7 +75,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) * This function assigns a new id to the namespace and adds it to the * appropriate namespace tree and list. */ -#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) +#define ns_tree_add(__ns) \ + __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) /** * ns_tree_remove - Remove a namespace from a namespace tree @@ -73,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) #define ns_tree_adjoined_rcu(__ns, __previous) \ __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) -#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node)) #endif /* _LINUX_NSTREE_H */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 46ebaa46e6c5..b18ab53561c9 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -3,16 +3,16 @@ #define _LINUX_OBJTOOL_H #include <linux/objtool_types.h> +#include <linux/annotate.h> #ifdef CONFIG_OBJTOOL -#include <asm/asm.h> - #ifndef __ASSEMBLY__ -#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ +#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ "987: \n\t" \ ".pushsection .discard.unwind_hints\n\t" \ + ANNOTATE_DATA_SPECIAL \ /* struct unwind_hint */ \ ".long 987b - .\n\t" \ ".short " __stringify(sp_offset) "\n\t" \ @@ -53,16 +53,6 @@ #define __ASM_BREF(label) label ## b -#define __ASM_ANNOTATE(label, type) \ - ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ - ".long " __stringify(label) " - .\n\t" \ - ".long " __stringify(type) "\n\t" \ - ".popsection\n\t" - -#define ASM_ANNOTATE(type) \ - "911:\n\t" \ - __ASM_ANNOTATE(911b, type) - #else /* __ASSEMBLY__ */ /* @@ -89,6 +79,7 @@ .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .Lhere_\@: .pushsection .discard.unwind_hints + ANNOTATE_DATA_SPECIAL /* struct unwind_hint */ .long .Lhere_\@ - . .short \sp_offset @@ -101,7 +92,7 @@ .macro STACK_FRAME_NON_STANDARD func:req .pushsection .discard.func_stack_frame_non_standard, "aw" - .long \func - . + .quad \func .popsection .endm @@ -111,14 +102,6 @@ #endif .endm -.macro ANNOTATE type:req -.Lhere_\@: - .pushsection .discard.annotate_insn,"M",@progbits,8 - .long .Lhere_\@ - . - .long \type - .popsection -.endm - #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -128,84 +111,15 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) -#define __ASM_ANNOTATE(label, type) "" -#define ASM_ANNOTATE(type) #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro ANNOTATE type:req -.endm #endif #endif /* CONFIG_OBJTOOL */ -#ifndef __ASSEMBLY__ -/* - * Annotate away the various 'relocation to !ENDBR` complaints; knowing that - * these relocations will never be used for indirect calls. - */ -#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) -#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) - -/* - * This should be used immediately before an indirect jump/call. It tells - * objtool the subsequent indirect jump/call is vouched safe for retpoline - * builds. - */ -#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) -/* - * See linux/instrumentation.h - */ -#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) -#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) -/* - * Use objtool to validate the entry requirement that all code paths do - * VALIDATE_UNRET_END before RET. - * - * NOTE: The macro must be used at the beginning of a global symbol, otherwise - * it will be ignored. - */ -#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) -/* - * This should be used to refer to an instruction that is considered - * terminating, like a noreturn CALL or UD2 when we know they are not -- eg - * WARN using UD2. - */ -#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) -/* - * This should not be used; it annotates away CFI violations. There are a few - * valid use cases like kexec handover to the next kernel image, and there is - * no security concern there. - * - * There are also a few real issues annotated away, like EFI because we can't - * control the EFI code. - */ -#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) - -#else -#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR -#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE -/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ -/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ -#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS -#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL -#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN -#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE -#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI -#endif - #if defined(CONFIG_NOINSTR_VALIDATION) && \ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) #define VALIDATE_UNRET_BEGIN ANNOTATE_UNRET_BEGIN diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index aceac94632c8..c6def4049b1a 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -67,4 +67,6 @@ struct unwind_hint { #define ANNOTYPE_REACHABLE 8 #define ANNOTYPE_NOCFI 9 +#define ANNOTYPE_DATA_SPECIAL 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1db8543dfc8a..1c2bc0281807 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -43,6 +43,8 @@ extern int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq); extern int of_irq_count(struct device_node *dev); extern int of_irq_get(struct device_node *dev, int index); +extern const struct cpumask *of_irq_get_affinity(struct device_node *dev, + int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); @@ -76,6 +78,11 @@ static inline int of_irq_get_byname(struct device_node *dev, const char *name) { return 0; } +static inline const struct cpumask *of_irq_get_affinity(struct device_node *dev, + int index) +{ + return NULL; +} static inline int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0091ad1986bf..f7a0e4af0c73 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1048,19 +1048,7 @@ PAGE_TYPE_OPS(Table, table, pgtable) */ PAGE_TYPE_OPS(Guard, guard, guard) -FOLIO_TYPE_OPS(slab, slab) - -/** - * PageSlab - Determine if the page belongs to the slab allocator - * @page: The page to test. - * - * Context: Any context. - * Return: True for slab pages, false for any other kind of page. - */ -static inline bool PageSlab(const struct page *page) -{ - return folio_test_slab(page_folio(page)); -} +PAGE_TYPE_OPS(Slab, slab, slab) #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) @@ -1076,7 +1064,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) -FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) +PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..e601a3144f28 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -38,6 +38,7 @@ int filemap_invalidate_pages(struct address_space *mapping, int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write); int filemap_fdatawait_keep_errors(struct address_space *mapping); int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); int filemap_fdatawait_range_keep_errors(struct address_space *mapping, @@ -53,14 +54,10 @@ static inline int filemap_fdatawait(struct address_space *mapping) bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); -int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) @@ -942,6 +939,17 @@ static inline pgoff_t folio_next_index(const struct folio *folio) } /** + * folio_next_pos - Get the file position of the next folio. + * @folio: The current folio. + * + * Return: The position of the folio which follows this folio in the file. + */ +static inline loff_t folio_next_pos(const struct folio *folio) +{ + return (loff_t)folio_next_index(folio) << PAGE_SHIFT; +} + +/** * folio_file_page - The page for a particular index. * @folio: The folio which contains this index. * @index: The index we want to look up. @@ -977,6 +985,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); +unsigned filemap_get_folios_dirty(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 12d90360f6db..43c854a273c3 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -52,7 +52,7 @@ __section(".discard") __attribute__((unused)) /* - * s390 and alpha modules require percpu variables to be defined as + * alpha modules require percpu variables to be defined as * weak to force the compiler to generate GOT based external * references for them. This is necessary because percpu sections * will be located outside of the usually addressable area. diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 93c9a26492fc..52b37f7bdbf9 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -119,6 +119,7 @@ struct arm_pmu { /* PMUv3 only */ int pmuver; + bool has_smt; u64 reg_pmmir; u64 reg_brbidr; #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 @@ -132,8 +133,6 @@ struct arm_pmu { #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) -DECLARE_PER_CPU(struct arm_pmu *, cpu_armpmu); - u64 armpmu_event_update(struct perf_event *event); int armpmu_event_set_period(struct perf_event *event); @@ -190,8 +189,8 @@ bool arm_pmu_irq_is_nmi(void); struct arm_pmu *armpmu_alloc(void); void armpmu_free(struct arm_pmu *pmu); int armpmu_register(struct arm_pmu *pmu); -int armpmu_request_irq(int irq, int cpu); -void armpmu_free_irq(int irq, int cpu); +int armpmu_request_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu); +void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu); #define ARMV8_PMU_PDEV_NAME "armv8-pmu" diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fd1d91017b99..9870d768db4c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark); + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 32e8457ad535..ee3148ef87f6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1232,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif +#ifndef flush_tlb_fix_spurious_fault_pmd +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 445517a72ad0..0e7ae12c96d2 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { - if (ns != &init_pid_ns) - ns_ref_inc(ns); + ns_ref_inc(ns); return ns; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 9d42d473d201..7f6a92ac9704 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -44,11 +44,11 @@ typedef unsigned int pipe_index_t; typedef unsigned short pipe_index_t; #endif -/* - * We have to declare this outside 'struct pipe_inode_info', - * but then we can't use 'union pipe_index' for an anonymous - * union, so we end up having to duplicate this declaration - * below. Annoying. +/** + * struct pipe_index - pipe indeces + * @head: The point of buffer production + * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail */ union pipe_index { unsigned long head_tail; @@ -63,9 +63,7 @@ union pipe_index { * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe - * @head: The point of buffer production - * @tail: The point of buffer consumption - * @head_tail: unsigned long union of @head and @tail + * @pipe_index: the pipe indeces * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) @@ -87,14 +85,7 @@ struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - /* This has to match the 'union pipe_index' above */ - union { - unsigned long head_tail; - struct { - pipe_index_t head; - pipe_index_t tail; - }; - }; + union pipe_index; unsigned int max_usage; unsigned int ring_size; diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d33..93c945331f39 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -102,6 +102,8 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev, extern int platform_get_irq(struct platform_device *, unsigned int); extern int platform_get_irq_optional(struct platform_device *, unsigned int); +extern int platform_get_irq_affinity(struct platform_device *, unsigned int, + const struct cpumask **); extern int platform_irq_count(struct platform_device *); extern int devm_platform_get_irqs_affinity(struct platform_device *dev, struct irq_affinity *affd, @@ -232,6 +234,7 @@ extern int platform_device_add_data(struct platform_device *pdev, extern int platform_device_add(struct platform_device *pdev); extern void platform_device_del(struct platform_device *pdev); extern void platform_device_put(struct platform_device *pdev); +DEFINE_FREE(platform_device_put, struct platform_device *, if (_T) platform_device_put(_T)) struct platform_driver { int (*probe)(struct platform_device *); diff --git a/include/linux/pm.h b/include/linux/pm.h index cc7b2dc28574..7f69f739f613 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -25,11 +25,12 @@ extern void (*pm_power_off)(void); struct device; /* we have a circular dep with device.h */ #ifdef CONFIG_VT_CONSOLE_SLEEP -extern void pm_vt_switch_required(struct device *dev, bool required); +extern int pm_vt_switch_required(struct device *dev, bool required); extern void pm_vt_switch_unregister(struct device *dev); #else -static inline void pm_vt_switch_required(struct device *dev, bool required) +static inline int pm_vt_switch_required(struct device *dev, bool required) { + return 0; } static inline void pm_vt_switch_unregister(struct device *dev) { @@ -507,6 +508,7 @@ const struct dev_pm_ops name = { \ * RECOVER Creation of a hibernation image or restoration of the main * memory contents from a hibernation image has failed, call * ->thaw() and ->complete() for all devices. + * POWEROFF System will poweroff, call ->poweroff() for all devices. * * The following PM_EVENT_ messages are defined for internal use by * kernel subsystems. They are never issued by the PM core. @@ -537,6 +539,7 @@ const struct dev_pm_ops name = { \ #define PM_EVENT_USER 0x0100 #define PM_EVENT_REMOTE 0x0200 #define PM_EVENT_AUTO 0x0400 +#define PM_EVENT_POWEROFF 0x0800 #define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) #define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND) @@ -551,6 +554,7 @@ const struct dev_pm_ops name = { \ #define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) #define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) #define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) +#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, }) #define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) #define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) #define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index f67a2cb7d781..93ba0143ca47 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -153,6 +153,7 @@ enum genpd_sync_state { }; struct dev_power_governor { + bool (*system_power_down_ok)(struct dev_pm_domain *domain); bool (*power_down_ok)(struct dev_pm_domain *domain); bool (*suspend_ok)(struct device *dev); }; diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 4a69d4af3ff8..6cea4455f867 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -162,6 +162,15 @@ static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +s32 cpu_wakeup_latency_qos_limit(void); +#else +static inline s32 cpu_wakeup_latency_qos_limit(void) +{ + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; +} +#endif + #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 0b436e15f4cd..911d7a4d32c1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -637,6 +637,30 @@ DEFINE_GUARD_COND(pm_runtime_active_auto, _try, DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, pm_runtime_resume_and_get(_T), _RET == 0) +/* ACQUIRE() wrapper macros for the guards defined above. */ + +#define PM_RUNTIME_ACQUIRE(_dev, _var) \ + ACQUIRE(pm_runtime_active_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var) \ + ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev) + +/* + * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active. + * + * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the + * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with + * any of them) and if it is nonzero, avoid accessing the given device. + */ +#define PM_RUNTIME_ACQUIRE_ERR(_var_ptr) \ + ACQUIRE_ERR(pm_runtime_active, _var_ptr) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b3d6..ff7dcc3fa105 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -47,10 +47,4 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) state->s4 = __seed(i, 128U); } -/* Pseudo random number generator from numerical recipes. */ -static inline u32 next_pseudo_random32(u32 seed) -{ - return seed * 1664525 + 1013904223; -} - #endif diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 102202185d7a..d964f965c8ff 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -134,11 +134,9 @@ static __always_inline unsigned char interrupt_context_level(void) /* * The following macros are deprecated and should not be used in new code: - * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ -#define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) diff --git a/include/linux/prmt.h b/include/linux/prmt.h index c53ab287e932..8cdc987de963 100644 --- a/include/linux/prmt.h +++ b/include/linux/prmt.h @@ -4,9 +4,11 @@ #ifdef CONFIG_ACPI_PRMT void init_prmt(void); +bool acpi_prm_handler_available(const guid_t *handler_guid); int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); #else static inline void init_prmt(void) { } +static inline bool acpi_prm_handler_available(const guid_t *handler_guid) { return false; } static inline int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer) { return -EOPNOTSUPP; diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 2503f7625d65..a651e60d9410 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h @@ -9,6 +9,7 @@ struct pseudo_fs_context { const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; + unsigned int s_d_flags; }; struct pseudo_fs_context *init_pseudo(struct fs_context *fc, diff --git a/include/linux/random.h b/include/linux/random.h index 333cecfca93f..8a8064dc3970 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -130,21 +130,6 @@ static inline int get_random_bytes_wait(void *buf, size_t nbytes) return ret; } -#define declare_get_random_var_wait(name, ret_type) \ - static inline int get_random_ ## name ## _wait(ret_type *out) { \ - int ret = wait_for_random_bytes(); \ - if (unlikely(ret)) \ - return ret; \ - *out = get_random_ ## name(); \ - return 0; \ - } -declare_get_random_var_wait(u8, u8) -declare_get_random_var_wait(u16, u16) -declare_get_random_var_wait(u32, u32) -declare_get_random_var_wait(u64, u32) -declare_get_random_var_wait(long, unsigned long) -#undef declare_get_random_var - #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index c26cb83ca071..a97c3bcb1656 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -145,7 +145,7 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, if (last) { WRITE_ONCE(n->next, last->next); - n->pprev = &last->next; + WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); @@ -155,8 +155,8 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { - n->pprev = &n->next; - n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); + WRITE_ONCE(n->pprev, &n->next); + WRITE_ONCE(n->next, (struct hlist_nulls_node *)NULLS_MARKER(NULL)); } /** diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a7d92718b653..54701668b3df 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -206,6 +206,8 @@ struct rdt_mon_domain { * @arch_has_sparse_bitmasks: True if a bitmask like f00f is valid. * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache * level has CPU scope. + * @io_alloc_capable: True if portion of the cache can be configured + * for I/O traffic. */ struct resctrl_cache { unsigned int cbm_len; @@ -213,6 +215,7 @@ struct resctrl_cache { unsigned int shareable_bits; bool arch_has_sparse_bitmasks; bool arch_has_per_cpu_cfg; + bool io_alloc_capable; }; /** @@ -654,6 +657,27 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid); +/** + * resctrl_arch_io_alloc_enable() - Enable/disable io_alloc feature. + * @r: The resctrl resource. + * @enable: Enable (true) or disable (false) io_alloc on resource @r. + * + * This can be called from any CPU. + * + * Return: + * 0 on success, <0 on error. + */ +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable); + +/** + * resctrl_arch_get_io_alloc_enabled() - Get io_alloc feature state. + * @r: The resctrl resource. + * + * Return: + * true if io_alloc is enabled or false if disabled. + */ +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 7e50bbc94e47..36ddfa1ec301 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -43,7 +43,7 @@ struct restart_block { struct __kernel_timespec __user *rmtp; struct old_timespec32 __user *compat_rmtp; }; - u64 expires; + ktime_t expires; } nanosleep; /* For poll */ struct { diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index e0135e0adae0..bf92227c78d0 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); - rseq_handle_notify_resume(NULL, regs); + rseq_handle_slowpath(regs); } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 05a221ce79a6..08e664b21f5a 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -355,12 +355,25 @@ static inline void rht_unlock(struct bucket_table *tbl, local_irq_restore(flags); } -static inline struct rhash_head *__rht_ptr( - struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) +enum rht_lookup_freq { + RHT_LOOKUP_NORMAL, + RHT_LOOKUP_LIKELY, +}; + +static __always_inline struct rhash_head *__rht_ptr( + struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt, + const enum rht_lookup_freq freq) { - return (struct rhash_head *) - ((unsigned long)p & ~BIT(0) ?: - (unsigned long)RHT_NULLS_MARKER(bkt)); + unsigned long p_val = (unsigned long)p & ~BIT(0); + + BUILD_BUG_ON(!__builtin_constant_p(freq)); + + if (freq == RHT_LOOKUP_LIKELY) + return (struct rhash_head *) + (likely(p_val) ? p_val : (unsigned long)RHT_NULLS_MARKER(bkt)); + else + return (struct rhash_head *) + (p_val ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* @@ -370,10 +383,17 @@ static inline struct rhash_head *__rht_ptr( * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ +static __always_inline struct rhash_head *__rht_ptr_rcu( + struct rhash_lock_head __rcu *const *bkt, + const enum rht_lookup_freq freq) +{ + return __rht_ptr(rcu_dereference_all(*bkt), bkt, freq); +} + static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { - return __rht_ptr(rcu_dereference_all(*bkt), bkt); + return __rht_ptr_rcu(bkt, RHT_LOOKUP_NORMAL); } static inline struct rhash_head *rht_ptr( @@ -381,13 +401,15 @@ static inline struct rhash_head *rht_ptr( struct bucket_table *tbl, unsigned int hash) { - return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); + return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt, + RHT_LOOKUP_NORMAL); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { - return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); + return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt, + RHT_LOOKUP_NORMAL); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, @@ -588,7 +610,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, /* Internal function, do not use. */ static __always_inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, - const struct rhashtable_params params) + const struct rhashtable_params params, + const enum rht_lookup_freq freq) { struct rhashtable_compare_arg arg = { .ht = ht, @@ -599,12 +622,13 @@ static __always_inline struct rhash_head *__rhashtable_lookup( struct rhash_head *he; unsigned int hash; + BUILD_BUG_ON(!__builtin_constant_p(freq)); tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { - rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { + rht_for_each_rcu_from(he, __rht_ptr_rcu(bkt, freq), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) @@ -643,11 +667,22 @@ static __always_inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { - struct rhash_head *he = __rhashtable_lookup(ht, key, params); + struct rhash_head *he = __rhashtable_lookup(ht, key, params, + RHT_LOOKUP_NORMAL); return he ? rht_obj(ht, he) : NULL; } +static __always_inline void *rhashtable_lookup_likely( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params, + RHT_LOOKUP_LIKELY); + + return likely(he) ? rht_obj(ht, he) : NULL; +} + /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table @@ -693,11 +728,22 @@ static __always_inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { - struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, + RHT_LOOKUP_NORMAL); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } +static __always_inline struct rhlist_head *rhltable_lookup_likely( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, + RHT_LOOKUP_LIKELY); + + return likely(he) ? container_of(he, struct rhlist_head, rhead) : NULL; +} + /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes if there is a clash, * otherwise it returns an error via ERR_PTR(). diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 69553e7c14c1..2266f4dc77b6 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -3,134 +3,164 @@ #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ - -#include <linux/preempt.h> #include <linux/sched.h> -#ifdef CONFIG_MEMBARRIER -# define RSEQ_EVENT_GUARD irq -#else -# define RSEQ_EVENT_GUARD preempt -#endif - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. - */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); -} +#include <uapi/linux/rseq.h> -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); +void __rseq_handle_slowpath(struct pt_regs *regs); -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +/* Invoked from resume_user_mode_work() */ +static inline void rseq_handle_slowpath(struct pt_regs *regs) { - if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (current->rseq.event.slowpath) + __rseq_handle_slowpath(regs); + } else { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + __rseq_handle_slowpath(regs); + } } -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ - scoped_guard(RSEQ_EVENT_GUARD) - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - rseq_handle_notify_resume(ksig, regs); -} +void __rseq_signal_deliver(int sig, struct pt_regs *regs); -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) +/* + * Invoked from signal delivery to fixup based on the register context before + * switching to the signal delivery context. + */ +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + __rseq_signal_deliver(ksig->sig, regs); + } else { + if (current->rseq.event.has_rseq) + __rseq_signal_deliver(ksig->sig, regs); + } } -/* rseq_migrate() requires preemption to be disabled. */ -static inline void rseq_migrate(struct task_struct *t) +static inline void rseq_raise_notify_resume(struct task_struct *t) { - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + set_tsk_thread_flag(t, TIF_RSEQ); } -/* - * If parent process has a registered restartable sequences area, the - * child inherits. Unregister rseq for a clone with CLONE_VM set. - */ -static inline void rseq_fork(struct task_struct *t, u64 clone_flags) +/* Invoked from context switch to force evaluation on exit to user */ +static __always_inline void rseq_sched_switch_event(struct task_struct *t) { - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; + struct rseq_event *ev = &t->rseq.event; + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Avoid a boat load of conditionals by using simple logic + * to determine whether NOTIFY_RESUME needs to be raised. + * + * It's required when the CPU or MM CID has changed or + * the entry was from user space. + */ + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + + if (raise) { + ev->sched_switch = true; + rseq_raise_notify_resume(t); + } } else { - t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; + if (ev->has_rseq) { + t->rseq.event.sched_switch = true; + rseq_raise_notify_resume(t); + } } } -static inline void rseq_execve(struct task_struct *t) +/* + * Invoked from __set_task_cpu() when a task migrates or from + * mm_cid_schedin() when the CID changes to enforce an IDs update. + * + * This does not raise TIF_NOTIFY_RESUME as that happens in + * rseq_sched_switch_event(). + */ +static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; + t->rseq.event.ids_changed = true; } -#else - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +/* Enforce a full update after RSEQ registration and when execve() failed */ +static inline void rseq_force_update(void) { + if (current->rseq.event.has_rseq) { + current->rseq.event.ids_changed = true; + current->rseq.event.sched_switch = true; + rseq_raise_notify_resume(current); + } } -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) + +/* + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, + * which clears TIF_NOTIFY_RESUME on architectures that don't use the + * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. + * + * To avoid updating user space RSEQ in that case just to do it eventually + * again before returning to user space, because __rseq_handle_slowpath() + * does nothing when invoked with NULL register state. + * + * After returning from guest mode, before exiting to userspace, hypervisors + * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. + */ +static inline void rseq_virt_userspace_exit(void) { + /* + * The generic optimization for deferring RSEQ updates until the next + * exit relies on having a dedicated TIF_RSEQ. + */ + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && + current->rseq.event.sched_switch) + rseq_raise_notify_resume(current); } -static inline void rseq_preempt(struct task_struct *t) + +static inline void rseq_reset(struct task_struct *t) { + memset(&t->rseq, 0, sizeof(t->rseq)); + t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } -static inline void rseq_migrate(struct task_struct *t) + +static inline void rseq_execve(struct task_struct *t) { + rseq_reset(t); } + +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Unregister rseq for a clone with CLONE_VM set. + * + * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault + * on the COW page on exit to user space, when the child stays on the same + * CPU as the parent. That's obviously not guaranteed, but in overcommit + * scenarios it is more likely and optimizes for the fork/exec case without + * taking the fault. + */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { -} -static inline void rseq_execve(struct task_struct *t) -{ + if (clone_flags & CLONE_VM) + rseq_reset(t); + else + t->rseq = current->rseq; } -#endif +#else /* CONFIG_RSEQ */ +static inline void rseq_handle_slowpath(struct pt_regs *regs) { } +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } +static inline void rseq_force_update(void) { } +static inline void rseq_virt_userspace_exit(void) { } +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } +static inline void rseq_execve(struct task_struct *t) { } +#endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ - void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) -{ -} - -#endif +#else /* CONFIG_DEBUG_RSEQ */ +static inline void rseq_syscall(struct pt_regs *regs) { } +#endif /* !CONFIG_DEBUG_RSEQ */ #endif /* _LINUX_RSEQ_H */ diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h new file mode 100644 index 000000000000..c92167ff8a7f --- /dev/null +++ b/include/linux/rseq_entry.h @@ -0,0 +1,616 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_ENTRY_H +#define _LINUX_RSEQ_ENTRY_H + +/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */ +#ifdef CONFIG_RSEQ_STATS +#include <linux/percpu.h> + +struct rseq_stats { + unsigned long exit; + unsigned long signal; + unsigned long slowpath; + unsigned long fastpath; + unsigned long ids; + unsigned long cs; + unsigned long clear; + unsigned long fixup; +}; + +DECLARE_PER_CPU(struct rseq_stats, rseq_stats); + +/* + * Slow path has interrupts and preemption enabled, but the fast path + * runs with interrupts disabled so there is no point in having the + * preemption checks implied in __this_cpu_inc() for every operation. + */ +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_stat_inc(which) this_cpu_inc((which)) +#else +#define rseq_stat_inc(which) raw_cpu_inc((which)) +#endif + +#else /* CONFIG_RSEQ_STATS */ +#define rseq_stat_inc(x) do { } while (0) +#endif /* !CONFIG_RSEQ_STATS */ + +#ifdef CONFIG_RSEQ +#include <linux/jump_label.h> +#include <linux/rseq.h> +#include <linux/uaccess.h> + +#include <linux/tracepoint-defs.h> + +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(rseq_update); +DECLARE_TRACEPOINT(rseq_ip_fixup); +void __rseq_trace_update(struct task_struct *t); +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip); + +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) +{ + if (tracepoint_enabled(rseq_update) && ids) + __rseq_trace_update(t); +} + +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + if (tracepoint_enabled(rseq_ip_fixup)) + __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); +} + +#else /* CONFIG_TRACEPOINT */ +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { } +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) { } +#endif /* !CONFIG_TRACEPOINT */ + +DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); + +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_inline +#else +#define rseq_inline __always_inline +#endif + +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); +bool rseq_debug_validate_ids(struct task_struct *t); + +static __always_inline void rseq_note_user_irq_entry(void) +{ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) + current->rseq.event.user_irq = true; +} + +/* + * Check whether there is a valid critical section and whether the + * instruction pointer in @regs is inside the critical section. + * + * - If the critical section is invalid, terminate the task. + * + * - If valid and the instruction pointer is inside, set it to the abort IP. + * + * - If valid and the instruction pointer is outside, clear the critical + * section address. + * + * Returns true, if the section was valid and either fixup or clear was + * done, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when a invalid + * section was found. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ + +#ifdef RSEQ_BUILD_SLOW_PATH +/* + * The debug version is put out of line, but kept here so the code stays + * together. + * + * @csaddr has already been checked by the caller to be in user space + */ +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, + unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; + unsigned long ip = instruction_pointer(regs); + u64 __user *uc_head = (u64 __user *) ucs; + u32 usig, __user *uc_sig; + + scoped_user_rw_access(ucs, efault) { + /* + * Evaluate the user pile and exit if one of the conditions + * is not fulfilled. + */ + unsafe_get_user(start_ip, &ucs->start_ip, efault); + if (unlikely(start_ip >= tasksize)) + goto die; + /* If outside, just clear the critical section. */ + if (ip < start_ip) + goto clear; + + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + cs_end = start_ip + offset; + /* Check for overflow and wraparound */ + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) + goto die; + + /* If not inside, clear it. */ + if (ip >= cs_end) + goto clear; + + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + /* Ensure it's "valid" */ + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) + goto die; + /* Validate that the abort IP is not in the critical section */ + if (unlikely(abort_ip - start_ip < offset)) + goto die; + + /* + * Check version and flags for 0. No point in emitting + * deprecated warnings before dying. That could be done in + * the slow path eventually, but *shrug*. + */ + unsafe_get_user(head, uc_head, efault); + if (unlikely(head)) + goto die; + + /* abort_ip - 4 is >= 0. See abort_ip check above */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* If not in interrupt from user context, let it die */ + if (unlikely(!t->rseq.event.user_irq)) + goto die; + } + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +/* + * On debug kernels validate that user space did not mess with it if the + * debug branch is enabled. + */ +bool rseq_debug_validate_ids(struct task_struct *t) +{ + struct rseq __user *rseq = t->rseq.usrptr; + u32 cpu_id, uval, node_id; + + /* + * On the first exit after registering the rseq region CPU ID is + * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! + */ + node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? + cpu_to_node(t->rseq.ids.cpu_id) : 0; + + scoped_user_read_access(rseq, efault) { + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +#endif /* RSEQ_BUILD_SLOW_PATH */ + +/* + * This only ensures that abort_ip is in the user address space and + * validates that it is preceded by the signature. + * + * No other sanity checks are done here, that's what the debug code is for. + */ +static rseq_inline bool +rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + unsigned long ip = instruction_pointer(regs); + unsigned long tasksize = TASK_SIZE; + u64 start_ip, abort_ip, offset; + u32 usig, __user *uc_sig; + + rseq_stat_inc(rseq_stats.cs); + + if (unlikely(csaddr >= tasksize)) { + t->rseq.event.fatal = true; + return false; + } + + if (static_branch_unlikely(&rseq_debug_enabled)) + return rseq_debug_update_user_cs(t, regs, csaddr); + + scoped_user_rw_access(ucs, efault) { + unsafe_get_user(start_ip, &ucs->start_ip, efault); + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + + /* + * No sanity checks. If user space screwed it up, it can + * keep the pieces. That's what debug code is for. + * + * If outside, just clear the critical section. + */ + if (ip - start_ip >= offset) + goto clear; + + /* + * Two requirements for @abort_ip: + * - Must be in user space as x86 IRET would happily return to + * the kernel. + * - The four bytes preceding the instruction at @abort_ip must + * contain the signature. + * + * The latter protects against the following attack vector: + * + * An attacker with limited abilities to write, creates a critical + * section descriptor, sets the abort IP to a library function or + * some other ROP gadget and stores the address of the descriptor + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP + * protection. + */ + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) + goto die; + + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* Invalidate the critical section */ + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + /* Update the instruction pointer */ + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +/* + * Updates CPU ID, Node ID and MM CID and reads the critical section + * address, when @csaddr != NULL. This allows to put the ID update and the + * read under the same uaccess region to spare a separate begin/end. + * + * As this is either invoked from a C wrapper with @csaddr = NULL or from + * the fast path code with a valid pointer, a clever compiler should be + * able to optimize the read out. Spares a duplicate implementation. + * + * Returns true, if the operation was successful, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when invalid data + * was found on debug kernels. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ +static rseq_inline +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, + u32 node_id, u64 *csaddr) +{ + struct rseq __user *rseq = t->rseq.usrptr; + + if (static_branch_unlikely(&rseq_debug_enabled)) { + if (!rseq_debug_validate_ids(t)) + return false; + } + + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); + unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); + if (csaddr) + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); + } + + /* Cache the new values */ + t->rseq.ids.cpu_cid = ids->cpu_cid; + rseq_stat_inc(rseq_stats.ids); + rseq_trace_update(t, ids); + return true; +efault: + return false; +} + +/* + * Update user space with new IDs and conditionally check whether the task + * is in a critical section. + */ +static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, + struct rseq_ids *ids, u32 node_id) +{ + u64 csaddr; + + if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + return false; + + /* + * On architectures which utilize the generic entry code this + * allows to skip the critical section when the entry was not from + * a user space interrupt, unless debug mode is enabled. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + if (!static_branch_unlikely(&rseq_debug_enabled)) { + if (likely(!t->rseq.event.user_irq)) + return true; + } + } + if (likely(!csaddr)) + return true; + /* Sigh, this really needs to do work */ + return rseq_update_user_cs(t, regs, csaddr); +} + +/* + * If you want to use this then convert your architecture to the generic + * entry code. I'm tired of building workarounds for people who can't be + * bothered to make the maintenance of generic infrastructure less + * burdensome. Just sucking everything into the architecture code and + * thereby making others chase the horrible hacks and keep them working is + * neither acceptable nor sustainable. + */ +#ifdef CONFIG_GENERIC_ENTRY + +/* + * This is inlined into the exit path because: + * + * 1) It's a one time comparison in the fast path when there is no event to + * handle + * + * 2) The access to the user space rseq memory (TLS) is unlikely to fault + * so the straight inline operation is: + * + * - Four 32-bit stores only if CPU ID/ MM CID need to be updated + * - One 64-bit load to retrieve the critical section address + * + * 3) In the unlikely case that the critical section address is != NULL: + * + * - One 64-bit load to retrieve the start IP + * - One 64-bit load to retrieve the offset for calculating the end + * - One 64-bit load to retrieve the abort IP + * - One 64-bit load to retrieve the signature + * - One store to clear the critical section address + * + * The non-debug case implements only the minimal required checking. It + * provides protection against a rogue abort IP in kernel space, which + * would be exploitable at least on x86, and also against a rogue CS + * descriptor by checking the signature at the abort IP. Any fallout from + * invalid critical section descriptors is a user space problem. The debug + * case provides the full set of checks and terminates the task if a + * condition is not met. + * + * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and + * tells the caller to loop back into exit_to_user_mode_loop(). The rseq + * slow path there will handle the failure. + */ +static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t) +{ + /* + * Page faults need to be disabled as this is called with + * interrupts disabled + */ + guard(pagefault)(); + if (likely(!t->rseq.event.ids_changed)) { + struct rseq __user *rseq = t->rseq.usrptr; + /* + * If IDs have not changed rseq_event::user_irq must be true + * See rseq_sched_switch_event(). + */ + u64 csaddr; + + if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs))) + return false; + + if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) { + if (unlikely(!rseq_update_user_cs(t, regs, csaddr))) + return false; + } + return true; + } + + struct rseq_ids ids = { + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + }; + u32 node_id = cpu_to_node(ids.cpu_id); + + return rseq_update_usr(t, regs, &ids, node_id); +} + +static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + struct task_struct *t = current; + + /* + * If the task did not go through schedule or got the flag enforced + * by the rseq syscall or execve, then nothing to do here. + * + * CPU ID and MM CID can only change when going through a context + * switch. + * + * rseq_sched_switch_event() sets the rseq_event::sched_switch bit + * only when rseq_event::has_rseq is true. That conditional is + * required to avoid setting the TIF bit if RSEQ is not registered + * for a task. rseq_event::sched_switch is cleared when RSEQ is + * unregistered by a task so it's sufficient to check for the + * sched_switch bit alone. + * + * A sane compiler requires three instructions for the nothing to do + * case including clearing the events, but your mileage might vary. + */ + if (unlikely((t->rseq.event.sched_switch))) { + rseq_stat_inc(rseq_stats.fastpath); + + if (unlikely(!rseq_exit_user_update(regs, t))) + return true; + } + /* Clear state so next entry starts from a clean slate */ + t->rseq.event.events = 0; + return false; +} + +/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */ +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +static __always_inline bool test_tif_rseq(unsigned long ti_work) +{ + return ti_work & _TIF_RSEQ; +} + +static __always_inline void clear_tif_rseq(void) +{ + static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME); + clear_thread_flag(TIF_RSEQ); +} +#else +static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; } +static __always_inline void clear_tif_rseq(void) { } +#endif + +static __always_inline bool +rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + if (likely(!test_tif_rseq(ti_work))) + return false; + + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { + current->rseq.event.slowpath = true; + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + return true; + } + + clear_tif_rseq(); + return false; +} + +#else /* CONFIG_GENERIC_ENTRY */ +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + return false; +} +#endif /* !CONFIG_GENERIC_ENTRY */ + +static __always_inline void rseq_syscall_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + /* Needed to remove the store for the !lockdep case */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + WARN_ON_ONCE(ev->sched_switch); + ev->events = 0; + } +} + +static __always_inline void rseq_irqentry_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + lockdep_assert_once(!ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing could not clear it. + */ + ev->events = 0; +} + +/* Required to keep ARM64 working */ +static __always_inline void rseq_exit_to_user_mode_legacy(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + if (static_branch_unlikely(&rseq_debug_enabled)) + WARN_ON_ONCE(ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing did not clear it. + */ + ev->events = 0; +} + +void __rseq_debug_syscall_return(struct pt_regs *regs); + +static inline void rseq_debug_syscall_return(struct pt_regs *regs) +{ + if (static_branch_unlikely(&rseq_debug_enabled)) + __rseq_debug_syscall_return(regs); +} +#else /* CONFIG_RSEQ */ +static inline void rseq_note_user_irq_entry(void) { } +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + return false; +} +static inline void rseq_syscall_exit_to_user_mode(void) { } +static inline void rseq_irqentry_exit_to_user_mode(void) { } +static inline void rseq_exit_to_user_mode_legacy(void) { } +static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } +#endif /* !CONFIG_RSEQ */ + +#endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h new file mode 100644 index 000000000000..332dc14b81c9 --- /dev/null +++ b/include/linux/rseq_types.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_TYPES_H +#define _LINUX_RSEQ_TYPES_H + +#include <linux/irq_work_types.h> +#include <linux/types.h> +#include <linux/workqueue_types.h> + +#ifdef CONFIG_RSEQ +struct rseq; + +/** + * struct rseq_event - Storage for rseq related event management + * @all: Compound to initialize and clear the data efficiently + * @events: Compound to access events with a single load/store + * @sched_switch: True if the task was scheduled and needs update on + * exit to user + * @ids_changed: Indicator that IDs need to be updated + * @user_irq: True on interrupt entry from user mode + * @has_rseq: True if the task has a rseq pointer installed + * @error: Compound error code for the slow path to analyze + * @fatal: User space data corrupted or invalid + * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME + * is required + * + * @sched_switch and @ids_changed must be adjacent and the combo must be + * 16bit aligned to allow a single store, when both are set at the same + * time in the scheduler. + */ +struct rseq_event { + union { + u64 all; + struct { + union { + u32 events; + struct { + u8 sched_switch; + u8 ids_changed; + u8 user_irq; + }; + }; + + u8 has_rseq; + u8 __pad; + union { + u16 error; + struct { + u8 fatal; + u8 slowpath; + }; + }; + }; + }; +}; + +/** + * struct rseq_ids - Cache for ids, which need to be updated + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the + * compiler emit a single compare on 64-bit + * @cpu_id: The CPU ID which was written last to user space + * @mm_cid: The MM CID which was written last to user space + * + * @cpu_id and @mm_cid are updated when the data is written to user space. + */ +struct rseq_ids { + union { + u64 cpu_cid; + struct { + u32 cpu_id; + u32 mm_cid; + }; + }; +}; + +/** + * struct rseq_data - Storage for all rseq related data + * @usrptr: Pointer to the registered user space RSEQ memory + * @len: Length of the RSEQ region + * @sig: Signature of critial section abort IPs + * @event: Storage for event management + * @ids: Storage for cached CPU ID and MM CID + */ +struct rseq_data { + struct rseq __user *usrptr; + u32 len; + u32 sig; + struct rseq_event event; + struct rseq_ids ids; +}; + +#else /* CONFIG_RSEQ */ +struct rseq_data { }; +#endif /* !CONFIG_RSEQ */ + +#ifdef CONFIG_SCHED_MM_CID + +#define MM_CID_UNSET BIT(31) +#define MM_CID_ONCPU BIT(30) +#define MM_CID_TRANSIT BIT(29) + +/** + * struct sched_mm_cid - Storage for per task MM CID data + * @active: MM CID is active for the task + * @cid: The CID associated to the task either permanently or + * borrowed from the CPU + */ +struct sched_mm_cid { + unsigned int active; + unsigned int cid; +}; + +/** + * struct mm_cid_pcpu - Storage for per CPU MM_CID data + * @cid: The CID associated to the CPU either permanently or + * while a task with a CID is running + */ +struct mm_cid_pcpu { + unsigned int cid; +}____cacheline_aligned_in_smp; + +/** + * struct mm_mm_cid - Storage for per MM CID data + * @pcpu: Per CPU storage for CIDs associated to a CPU + * @percpu: Set, when CIDs are in per CPU mode + * @transit: Set to MM_CID_TRANSIT during a mode change transition phase + * @max_cids: The exclusive maximum CID value for allocation and convergence + * @irq_work: irq_work to handle the affinity mode change case + * @work: Regular work to handle the affinity mode change case + * @lock: Spinlock to protect against affinity setting which can't take @mutex + * @mutex: Mutex to serialize forks and exits related to this mm + * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map + * is growth only. + * @users: The number of tasks sharing this MM. Separate from mm::mm_users + * as that is modified by mmget()/mm_put() by other entities which + * do not actually share the MM. + * @pcpu_thrs: Threshold for switching back from per CPU mode + * @update_deferred: A deferred switch back to per task mode is pending. + */ +struct mm_mm_cid { + /* Hotpath read mostly members */ + struct mm_cid_pcpu __percpu *pcpu; + unsigned int percpu; + unsigned int transit; + unsigned int max_cids; + + /* Rarely used. Moves @lock and @mutex into the second cacheline */ + struct irq_work irq_work; + struct work_struct work; + + raw_spinlock_t lock; + struct mutex mutex; + + /* Low frequency modified */ + unsigned int nr_cpus_allowed; + unsigned int users; + unsigned int pcpu_thrs; + unsigned int update_deferred; +}____cacheline_aligned_in_smp; +#else /* CONFIG_SCHED_MM_CID */ +struct mm_mm_cid { }; +struct sched_mm_cid { }; +#endif /* !CONFIG_SCHED_MM_CID */ + +#endif diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index ffb9907c7070..cc7ad189caa5 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -75,7 +75,7 @@ struct sbitmap { */ struct sbitmap_word *map; - /* + /** * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different @@ -128,7 +128,7 @@ struct sbitmap_queue { */ struct sbq_wait_state *ws; - /* + /** * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; @@ -547,6 +547,8 @@ static inline void sbq_index_atomic_inc(atomic_t *index) * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. + * + * Return: Next wait queue to be used */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) diff --git a/include/linux/sched.h b/include/linux/sched.h index b469878de25c..d395f2810fac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -41,7 +41,7 @@ #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> -#include <uapi/linux/rseq.h> +#include <linux/rseq_types.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> @@ -637,8 +637,8 @@ struct sched_rt_entity { #endif } __randomize_layout; -typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); -typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); +struct rq_flags; +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; @@ -685,20 +685,22 @@ struct sched_dl_entity { * * @dl_server tells if this is a server entity. * - * @dl_defer tells if this is a deferred or regular server. For - * now only defer server exists. - * - * @dl_defer_armed tells if the deferrable server is waiting - * for the replenishment timer to activate it. - * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. + * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. + * + * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; @@ -709,6 +711,7 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; + unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -730,9 +733,6 @@ struct sched_dl_entity { * dl_server_update(). * * @rq the runqueue this server is for - * - * @server_has_tasks() returns true if @server_pick return a - * runnable task. */ struct rq *rq; dl_server_pick_f server_pick_task; @@ -1324,7 +1324,10 @@ struct task_struct { struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; -#endif +#ifdef CONFIG_PREEMPT_RT + struct llist_node cg_dead_lnode; +#endif /* CONFIG_PREEMPT_RT */ +#endif /* CONFIG_CGROUPS */ #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; @@ -1406,33 +1409,8 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; - /* - * RmW on rseq_event_mask must be performed atomically - * with respect to preemption. - */ - unsigned long rseq_event_mask; -# ifdef CONFIG_DEBUG_RSEQ - /* - * This is a place holder to save a copy of the rseq fields for - * validation of read-only fields. The struct rseq has a - * variable-length array at the end, so it cannot be used - * directly. Reserve a size large enough for the known fields. - */ - char rseq_fields[sizeof(struct rseq)]; -# endif -#endif - -#ifdef CONFIG_SCHED_MM_CID - int mm_cid; /* Current cid in mm */ - int last_mm_cid; /* Most recent cid in mm */ - int migrate_from_cpu; - int mm_cid_active; /* Whether cid bitmap is active */ - struct callback_head cid_work; -#endif + struct rseq_data rseq; + struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; @@ -1861,8 +1839,8 @@ extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ -extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); +/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ +extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task @@ -1901,6 +1879,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); @@ -2058,6 +2037,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +static inline void set_need_resched_current(void) +{ + lockdep_assert_irqs_disabled(); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @@ -2318,6 +2304,32 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +/* Avoids recursive inclusion hell */ +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit(struct task_struct *t); +static __always_inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit(struct task_struct *t) { } +static __always_inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return task_cpu(t); +} +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index b7fafe999073..624fda17a785 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,7 +8,7 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) { /* * By convention, dumpable bits are contained in first 32 bits of the diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index d82b7a9b0658..bcb962d5ee7d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -17,7 +17,18 @@ enum scx_public_consts { SCX_OPS_NAME_LEN = 128, + /* + * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses + * to set the slice for a task that is selected for execution. + * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice + * refill has been triggered. + * + * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass + * mode. As making forward progress for all tasks is the main goal of + * the bypass mode, a shorter slice is used. + */ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; @@ -46,6 +57,7 @@ enum scx_dsq_id_flags { SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3, SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; @@ -58,6 +70,7 @@ enum scx_dsq_id_flags { */ struct scx_dispatch_q { raw_spinlock_t lock; + struct task_struct __rcu *first_task; /* lockless peek at head */ struct list_head list; /* tasks in dispatch order */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; @@ -136,6 +149,13 @@ struct scx_dsq_list_node { u32 priv; /* can be used by iter cursor */ }; +#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ + (struct scx_dsq_list_node) { \ + .node = LIST_HEAD_INIT((__node).node), \ + .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ + .priv = (__priv), \ + } + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. @@ -207,16 +227,18 @@ struct sched_ext_entity { struct list_head tasks_node; }; -void sched_ext_free(struct task_struct *p); +void sched_ext_dead(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); +bool scx_hardlockup(int cpu); bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ -static inline void sched_ext_free(struct task_struct *p) {} +static inline void sched_ext_dead(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} +static inline bool scx_hardlockup(int cpu) { return false; } static inline bool scx_rcu_cpu_stall(void) { return false; } #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -228,6 +250,7 @@ struct scx_task_group { u64 bw_period_us; u64 bw_quota_us; u64 bw_burst_us; + bool idle; #endif }; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index bbcfdf12aa6e..45c0022b91ce 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -92,6 +92,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ /* idle_balance() stats */ + unsigned int newidle_call; + unsigned int newidle_success; + unsigned int newidle_ratio; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/include/linux/security.h b/include/linux/security.h index 92ac3f27b973..eb36451ce41f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -85,6 +85,7 @@ struct timezone; enum lsm_event { LSM_POLICY_CHANGE, + LSM_STARTED_ALL, }; struct dm_verity_digest { @@ -167,8 +168,6 @@ struct lsm_prop { }; extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1]; -extern u32 lsm_active_cnt; -extern const struct lsm_id *lsm_idlist[]; /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5ce48eab7a2a..a8a8661839b6 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1209,4 +1209,118 @@ done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } + +enum ss_state { + ss_done = 0, + ss_lock, + ss_lock_irqsave, + ss_lockless, +}; + +struct ss_tmp { + enum ss_state state; + unsigned long data; + spinlock_t *lock; + spinlock_t *lock_irqsave; +}; + +static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) +{ + if (sst->lock) + spin_unlock(sst->lock); + if (sst->lock_irqsave) + spin_unlock_irqrestore(sst->lock_irqsave, sst->data); +} + +extern void __scoped_seqlock_invalid_target(void); + +#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN) +/* + * For some reason some GCC-8 architectures (nios2, alpha) have trouble + * determining that the ss_done state is impossible in __scoped_seqlock_next() + * below. + * + * Similarly KASAN is known to confuse compilers enough to break this. But we + * don't care about code quality for KASAN builds anyway. + */ +static inline void __scoped_seqlock_bug(void) { } +#else +/* + * Canary for compiler optimization -- if the compiler doesn't realize this is + * an impossible state, it very likely generates sub-optimal code here. + */ +extern void __scoped_seqlock_bug(void); +#endif + +static inline void +__scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target) +{ + switch (sst->state) { + case ss_done: + __scoped_seqlock_bug(); + return; + + case ss_lock: + case ss_lock_irqsave: + sst->state = ss_done; + return; + + case ss_lockless: + if (!read_seqretry(lock, sst->data)) { + sst->state = ss_done; + return; + } + break; + } + + switch (target) { + case ss_done: + __scoped_seqlock_invalid_target(); + return; + + case ss_lock: + sst->lock = &lock->lock; + spin_lock(sst->lock); + sst->state = ss_lock; + return; + + case ss_lock_irqsave: + sst->lock_irqsave = &lock->lock; + spin_lock_irqsave(sst->lock_irqsave, sst->data); + sst->state = ss_lock_irqsave; + return; + + case ss_lockless: + sst->data = read_seqbegin(lock); + return; + } +} + +#define __scoped_seqlock_read(_seqlock, _target, _s) \ + for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) = \ + { .state = ss_lockless, .data = read_seqbegin(_seqlock) }; \ + _s.state != ss_done; \ + __scoped_seqlock_next(&_s, _seqlock, _target)) + +/** + * scoped_seqlock_read (lock, ss_state) - execute the read side critical + * section without manual sequence + * counter handling or calls to other + * helpers + * @lock: pointer to seqlock_t protecting the data + * @ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} indicating + * the type of critical read section + * + * Example: + * + * scoped_seqlock_read (&lock, ss_lock) { + * // read-side critical section + * } + * + * Starts with a lockess pass first. If it fails, restarts the critical + * section with the lock held. + */ +#define scoped_seqlock_read(_seqlock, _target) \ + __scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock)) + #endif /* __LINUX_SEQLOCK_H */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..774efe592a9a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -111,7 +111,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list); -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); +void shmem_truncate_range(struct inode *inode, loff_t start, uoff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/socket.h b/include/linux/socket.h index 944027f9765e..ec715ad4bf25 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -468,10 +468,10 @@ extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); extern int __sys_listen_socket(struct socket *sock, int backlog); +extern int do_getsockname(struct socket *sock, int peer, + struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len); -extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len); + int __user *usockaddr_len, int peer); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ada65b58bc4c..344ad51c8f6c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -25,8 +25,12 @@ struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key); +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); +#ifndef CONFIG_TINY_SRCU +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); +int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name, + struct lock_class_key *key); +#endif // #ifndef CONFIG_TINY_SRCU #define init_srcu_struct(ssp) \ ({ \ @@ -35,22 +39,42 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name, __init_srcu_struct((ssp), #ssp, &__srcu_key); \ }) +#define init_srcu_struct_fast(ssp) \ +({ \ + static struct lock_class_key __srcu_key; \ + \ + __init_srcu_struct_fast((ssp), #ssp, &__srcu_key); \ +}) + +#define init_srcu_struct_fast_updown(ssp) \ +({ \ + static struct lock_class_key __srcu_key; \ + \ + __init_srcu_struct_fast_updown((ssp), #ssp, &__srcu_key); \ +}) + #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *ssp); +#ifndef CONFIG_TINY_SRCU +int init_srcu_struct_fast(struct srcu_struct *ssp); +int init_srcu_struct_fast_updown(struct srcu_struct *ssp); +#endif // #ifndef CONFIG_TINY_SRCU #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ -#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). -#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). -// 0x4 // SRCU-lite is no longer with us. -#define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). -#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ - SRCU_READ_FLAVOR_FAST) // All of the above. -#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_FAST +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). +#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +// 0x4 // SRCU-lite is no longer with us. +#define SRCU_READ_FLAVOR_FAST 0x4 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_FAST_UPDOWN 0x8 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ + SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) + // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) // Flavors requiring synchronize_rcu() // instead of smp_mb(). void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); @@ -259,29 +283,78 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but for a light-weight - * smp_mb()-free reader. See srcu_read_lock() for more information. - * - * If srcu_read_lock_fast() is ever used on an srcu_struct structure, - * then none of the other flavors may be used, whether before, during, - * or after. Note that grace-period auto-expediting is disabled for _fast - * srcu_struct structures because auto-expedited grace periods invoke - * synchronize_rcu_expedited(), IPIs and all. - * - * Note that srcu_read_lock_fast() can be invoked only from those contexts - * where RCU is watching, that is, from contexts where it would be legal - * to invoke rcu_read_lock(). Otherwise, lockdep will complain. + * smp_mb()-free reader. See srcu_read_lock() for more information. This + * function is NMI-safe, in a manner similar to srcu_read_lock_nmisafe(). + * + * For srcu_read_lock_fast() to be used on an srcu_struct structure, + * that structure must have been defined using either DEFINE_SRCU_FAST() + * or DEFINE_STATIC_SRCU_FAST() on the one hand or initialized with + * init_srcu_struct_fast() on the other. Such an srcu_struct structure + * cannot be passed to any non-fast variant of srcu_read_{,un}lock() or + * srcu_{down,up}_read(). In kernels built with CONFIG_PROVE_RCU=y, + * __srcu_check_read_flavor() will complain bitterly if you ignore this + * restriction. + * + * Grace-period auto-expediting is disabled for SRCU-fast srcu_struct + * structures because SRCU-fast expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. If you need expedited + * SRCU-fast grace periods, use synchronize_srcu_expedited(). + * + * The srcu_read_lock_fast() function can be invoked only from those + * contexts where RCU is watching, that is, from contexts where it would + * be legal to invoke rcu_read_lock(). Otherwise, lockdep will complain. */ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp) { struct srcu_ctr __percpu *retval; RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; } +/** + * srcu_read_lock_fast_updown - register a new reader for an SRCU-fast-updown structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. + * This function is compatible with srcu_down_read_fast(), but is not + * NMI-safe. + * + * For srcu_read_lock_fast_updown() to be used on an srcu_struct + * structure, that structure must have been defined using either + * DEFINE_SRCU_FAST_UPDOWN() or DEFINE_STATIC_SRCU_FAST_UPDOWN() on the one + * hand or initialized with init_srcu_struct_fast_updown() on the other. + * Such an srcu_struct structure cannot be passed to any non-fast-updown + * variant of srcu_read_{,un}lock() or srcu_{down,up}_read(). In kernels + * built with CONFIG_PROVE_RCU=y, __srcu_check_read_flavor() will complain + * bitterly if you ignore this * restriction. + * + * Grace-period auto-expediting is disabled for SRCU-fast-updown + * srcu_struct structures because SRCU-fast-updown expedited grace periods + * invoke synchronize_rcu_expedited(), IPIs and all. If you need expedited + * SRCU-fast-updown grace periods, use synchronize_srcu_expedited(). + * + * The srcu_read_lock_fast_updown() function can be invoked only from + * those contexts where RCU is watching, that is, from contexts where + * it would be legal to invoke rcu_read_lock(). Otherwise, lockdep will + * complain. + */ +static inline struct srcu_ctr __percpu *srcu_read_lock_fast_updown(struct srcu_struct *ssp) +__acquires(ssp) +{ + struct srcu_ctr __percpu *retval; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast_updown()."); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); + retval = __srcu_read_lock_fast_updown(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + /* * Used by tracing, cannot be traced and cannot call lockdep. * See srcu_read_lock_fast() for more information. @@ -291,7 +364,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_ { struct srcu_ctr __percpu *retval; - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); return retval; } @@ -305,14 +378,15 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_ * srcu_down_read() for more information. * * The same srcu_struct may be used concurrently by srcu_down_read_fast() - * and srcu_read_lock_fast(). + * and srcu_read_lock_fast(). However, the same definition/initialization + * requirements called out for srcu_read_lock_safe() apply. */ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast()."); - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); - return __srcu_read_lock_fast(ssp); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); + return __srcu_read_lock_fast_updown(ssp); } /** @@ -408,6 +482,23 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } +/** + * srcu_read_unlock_fast_updown - unregister a old reader from an SRCU-fast-updown structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast_updown(). + * + * Exit an SRCU-fast-updown read-side critical section. + */ +static inline void +srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) __releases(ssp) +{ + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock_fast_updown(ssp, scp); + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "RCU must be watching srcu_read_unlock_fast_updown()."); +} + /* * Used by tracing, cannot be traced and cannot call lockdep. * See srcu_read_unlock_fast() for more information. @@ -431,9 +522,9 @@ static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __ __releases(ssp) { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); - srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); - __srcu_read_unlock_fast(ssp, scp); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast()."); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); + __srcu_read_unlock_fast_updown(ssp, scp); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast_updown()."); } /** diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 51ce25f07930..e0698024667a 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -31,7 +31,7 @@ struct srcu_struct { void srcu_drive_gp(struct work_struct *wp); -#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored) \ +#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored, ____ignored) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ .srcu_cb_tail = &name.srcu_cb_head, \ @@ -44,13 +44,25 @@ void srcu_drive_gp(struct work_struct *wp); * Tree SRCU, which needs some per-CPU data. */ #define DEFINE_SRCU(name) \ - struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) #define DEFINE_STATIC_SRCU(name) \ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) +#define DEFINE_SRCU_FAST(name) DEFINE_SRCU(name) +#define DEFINE_STATIC_SRCU_FAST(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) +#define DEFINE_SRCU_FAST_UPDOWN(name) DEFINE_SRCU(name) +#define DEFINE_STATIC_SRCU_FAST_UPDOWN(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) // Dummy structure for srcu_notifier_head. struct srcu_usage { }; #define __SRCU_USAGE_INIT(name) { } +#define __init_srcu_struct_fast __init_srcu_struct +#define __init_srcu_struct_fast_updown __init_srcu_struct +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define init_srcu_struct_fast init_srcu_struct +#define init_srcu_struct_fast_updown init_srcu_struct +#endif // #ifndef CONFIG_DEBUG_LOCK_ALLOC void synchronize_srcu(struct srcu_struct *ssp); @@ -93,6 +105,17 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp)); } +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast_updown(struct srcu_struct *ssp) +{ + return __srcu_ctr_to_ptr(ssp, __srcu_read_lock(ssp)); +} + +static inline +void __srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp)); +} + static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) { synchronize_srcu(ssp); @@ -103,8 +126,8 @@ static inline void srcu_barrier(struct srcu_struct *ssp) synchronize_srcu(ssp); } +static inline void srcu_expedite_current(struct srcu_struct *ssp) { } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 42098e0fa0b7..d6f978b50472 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -42,6 +42,8 @@ struct srcu_data { struct timer_list delay_work; /* Delay for CB invoking */ struct work_struct work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct rcu_head srcu_ec_head; /* For srcu_expedite_current() use. */ + int srcu_ec_state; /* State for srcu_expedite_current(). */ struct srcu_node *mynode; /* Leaf srcu_node. */ unsigned long grpmask; /* Mask for leaf srcu_node */ /* ->srcu_data_have_cbs[]. */ @@ -102,6 +104,7 @@ struct srcu_usage { struct srcu_struct { struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + u8 srcu_reader_flavor; struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. */ }; @@ -135,6 +138,11 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 +/* Values for srcu_expedite_current() state (->srcu_ec_state). */ +#define SRCU_EC_IDLE 0 +#define SRCU_EC_PENDING 1 +#define SRCU_EC_REPOST 2 + /* * Values for initializing gp sequence fields. Higher values allow wrap arounds to * occur earlier. @@ -155,20 +163,21 @@ struct srcu_struct { .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ } -#define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ +#define __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ .srcu_sup = &usage_name, \ + .srcu_reader_flavor = fast, \ __SRCU_DEP_MAP_INIT(name) -#define __SRCU_STRUCT_INIT_MODULE(name, usage_name) \ +#define __SRCU_STRUCT_INIT_MODULE(name, usage_name, fast) \ { \ - __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ } -#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ +#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name, fast) \ { \ .sda = &pcpu_name, \ .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \ - __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ } /* @@ -189,27 +198,45 @@ struct srcu_struct { * init_srcu_struct(&my_srcu); * * See include/linux/percpu-defs.h for the rules on per-CPU variables. + * + * DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST create an srcu_struct + * and associated structures whose readers must be of the SRCU-fast variety. + * DEFINE_SRCU_FAST_UPDOWN() and DEFINE_STATIC_SRCU_FAST_UPDOWN() create + * an srcu_struct and associated structures whose readers must be of the + * SRCU-fast-updown variety. The key point (aside from error checking) with + * both varieties is that the grace periods must use synchronize_rcu() + * instead of smp_mb(), and given that the first (for example) + * srcu_read_lock_fast() might race with the first synchronize_srcu(), + * this different must be specified at initialization time. */ #ifdef MODULE -# define __DEFINE_SRCU(name, is_static) \ +# define __DEFINE_SRCU(name, fast, is_static) \ static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage); \ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage, \ + fast); \ extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else -# define __DEFINE_SRCU(name, is_static) \ +# define __DEFINE_SRCU(name, fast, is_static) \ static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ is_static struct srcu_struct name = \ - __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data) + __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data, fast) #endif -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, 0, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, 0, static) +#define DEFINE_SRCU_FAST(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, /* not static */) +#define DEFINE_STATIC_SRCU_FAST(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, static) +#define DEFINE_SRCU_FAST_UPDOWN(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST_UPDOWN, \ + /* not static */) +#define DEFINE_STATIC_SRCU_FAST_UPDOWN(name) \ + __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST_UPDOWN, static) int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); +void srcu_expedite_current(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); // Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that @@ -289,23 +316,49 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); // Z, and implicit RCU reader. } -void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); - -// Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is -// needed only for flavors that require grace-period smp_mb() calls to be -// promoted to synchronize_rcu(). -static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor) +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns a pointer that must be passed to the matching + * srcu_read_unlock_fast_updown(). This type of reader is compatible + * with srcu_down_read_fast() and srcu_up_read_fast(). + * + * See the __srcu_read_lock_fast() comment for more details. + */ +static inline +struct srcu_ctr __percpu notrace *__srcu_read_lock_fast_updown(struct srcu_struct *ssp) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor)) - return; + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_locks.counter); // Y, and implicit RCU reader. + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); // Y, and implicit RCU reader. + barrier(); /* Avoid leaking the critical section. */ + return scp; +} - // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. - __srcu_check_read_flavor(ssp, read_flavor); +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_fast(), but it must be within the same task. + * + * Please see the __srcu_read_lock_fast() function's header comment for + * information on implicit RCU readers and NMI safety. + */ +static inline void notrace +__srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + barrier(); /* Avoid leaking the critical section. */ + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_unlocks.counter); // Z, and implicit RCU reader. + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); // Z, and implicit RCU reader. } -// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); + +// Record SRCU-reader usage type only for CONFIG_PROVE_RCU=y kernels. static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { if (IS_ENABLED(CONFIG_PROVE_RCU)) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 66c06fcdfe19..cf84d98964b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,6 +77,7 @@ struct cachestat_range; struct cachestat; struct statmount; struct mnt_id_req; +struct ns_id_req; struct xattr_args; struct file_attr; @@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req, asmlinkage long sys_listmount(const struct mnt_id_req __user *req, u64 __user *mnt_ids, size_t nr_mnt_ids, unsigned int flags); +asmlinkage long sys_listns(const struct ns_id_req __user *req, + u64 __user *ns_ids, size_t nr_ns_ids, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, off_t length); #if BITS_PER_LONG == 32 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index dd925d84fa46..b40de9bab4b7 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -67,6 +67,11 @@ enum syscall_work_bit { #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED #endif +#ifndef TIF_RSEQ +# define TIF_RSEQ TIF_NOTIFY_RESUME +# define _TIF_RSEQ _TIF_NOTIFY_RESUME +#endif + #ifdef __KERNEL__ #ifndef arch_set_restart_data diff --git a/include/linux/timer.h b/include/linux/timer.h index 0414d9e6b4fc..62e1cea71125 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -188,4 +188,13 @@ int timers_dead_cpu(unsigned int cpu); #define timers_dead_cpu NULL #endif +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask); +#else +static inline int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + return 0; +} +#endif + #endif diff --git a/include/linux/tpm.h b/include/linux/tpm.h index dc0338a783f3..b15360ff78d7 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -473,6 +473,7 @@ extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, extern int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max); extern struct tpm_chip *tpm_default_chip(void); void tpm2_flush_context(struct tpm_chip *chip, u32 handle); +int tpm2_find_hash_alg(unsigned int crypto_id); static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle) { diff --git a/include/linux/types.h b/include/linux/types.h index 6dfdb8e8e4c3..d4437e9c452c 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -50,6 +50,7 @@ typedef __kernel_old_gid_t old_gid_t; #if defined(__GNUC__) typedef __kernel_loff_t loff_t; +typedef __kernel_uoff_t uoff_t; #endif /* diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1beb5b395d81..be395f5f7ee3 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,7 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include <linux/cleanup.h> #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> @@ -35,9 +36,17 @@ #ifdef masked_user_access_begin #define can_do_masked_user_access() 1 +# ifndef masked_user_write_access_begin +# define masked_user_write_access_begin masked_user_access_begin +# endif +# ifndef masked_user_read_access_begin +# define masked_user_read_access_begin masked_user_access_begin +#endif #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL + #define masked_user_read_access_begin(src) NULL + #define masked_user_write_access_begin(src) NULL #define mask_user_address(src) (src) #endif @@ -518,7 +527,34 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); -#ifndef __get_kernel_nofault +#ifdef arch_get_kernel_nofault +/* + * Wrap the architecture implementation so that @label can be outside of a + * cleanup() scope. A regular C goto works correctly, but ASM goto does + * not. Clang rejects such an attempt, but GCC silently emits buggy code. + */ +#define __get_kernel_nofault(dst, src, type, label) \ +do { \ + __label__ local_label; \ + arch_get_kernel_nofault(dst, src, type, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, label) \ +do { \ + __label__ local_label; \ + arch_put_kernel_nofault(dst, src, type, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */ + #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ @@ -535,7 +571,8 @@ do { \ if (__put_user(data, p)) \ goto label; \ } while (0) -#endif + +#endif /* !__get_kernel_nofault */ /** * get_kernel_nofault(): safely attempt to read from a location @@ -549,7 +586,42 @@ do { \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) -#ifndef user_access_begin +#ifdef user_access_begin + +#ifdef arch_unsafe_get_user +/* + * Wrap the architecture implementation so that @label can be outside of a + * cleanup() scope. A regular C goto works correctly, but ASM goto does + * not. Clang rejects such an attempt, but GCC silently emits buggy code. + * + * Some architectures use internal local labels already, but this extra + * indirection here is harmless because the compiler optimizes it out + * completely in any case. This construct just ensures that the ASM GOTO + * target is always in the local scope. The C goto 'label' works correctly + * when leaving a cleanup() scope. + */ +#define unsafe_get_user(x, ptr, label) \ +do { \ + __label__ local_label; \ + arch_unsafe_get_user(x, ptr, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#define unsafe_put_user(x, ptr, label) \ +do { \ + __label__ local_label; \ + arch_unsafe_put_user(x, ptr, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) +#endif /* arch_unsafe_get_user */ + +#else /* user_access_begin */ #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) @@ -559,7 +631,8 @@ do { \ #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } -#endif +#endif /* !user_access_begin */ + #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end @@ -569,6 +642,239 @@ static inline void user_access_restore(unsigned long flags) { } #define user_read_access_end user_access_end #endif +/* Define RW variant so the below _mode macro expansion works */ +#define masked_user_rw_access_begin(u) masked_user_access_begin(u) +#define user_rw_access_begin(u, s) user_access_begin(u, s) +#define user_rw_access_end() user_access_end() + +/* Scoped user access */ +#define USER_ACCESS_GUARD(_mode) \ +static __always_inline void __user * \ +class_user_##_mode##_begin(void __user *ptr) \ +{ \ + return ptr; \ +} \ + \ +static __always_inline void \ +class_user_##_mode##_end(void __user *ptr) \ +{ \ + user_##_mode##_access_end(); \ +} \ + \ +DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ + class_user_##_mode##_end(_T), \ + class_user_##_mode##_begin(ptr), void __user *ptr) \ + \ +static __always_inline class_user_##_mode##_access_t \ +class_user_##_mode##_access_ptr(void __user *scope) \ +{ \ + return scope; \ +} + +USER_ACCESS_GUARD(read) +USER_ACCESS_GUARD(write) +USER_ACCESS_GUARD(rw) +#undef USER_ACCESS_GUARD + +/** + * __scoped_user_access_begin - Start a scoped user access + * @mode: The mode of the access class (read, write, rw) + * @uptr: The pointer to access user space memory + * @size: Size of the access + * @elbl: Error label to goto when the access region is rejected + * + * Internal helper for __scoped_user_access(). Don't use directly. + */ +#define __scoped_user_access_begin(mode, uptr, size, elbl) \ +({ \ + typeof(uptr) __retptr; \ + \ + if (can_do_masked_user_access()) { \ + __retptr = masked_user_##mode##_access_begin(uptr); \ + } else { \ + __retptr = uptr; \ + if (!user_##mode##_access_begin(uptr, size)) \ + goto elbl; \ + } \ + __retptr; \ +}) + +/** + * __scoped_user_access - Open a scope for user access + * @mode: The mode of the access class (read, write, rw) + * @uptr: The pointer to access user space memory + * @size: Size of the access + * @elbl: Error label to goto when the access region is rejected. It + * must be placed outside the scope + * + * If the user access function inside the scope requires a fault label, it + * can use @elbl or a different label outside the scope, which requires + * that user access which is implemented with ASM GOTO has been properly + * wrapped. See unsafe_get_user() for reference. + * + * scoped_user_rw_access(ptr, efault) { + * unsafe_get_user(rval, &ptr->rval, efault); + * unsafe_put_user(wval, &ptr->wval, efault); + * } + * return 0; + * efault: + * return -EFAULT; + * + * The scope is internally implemented as a autoterminating nested for() + * loop, which can be left with 'return', 'break' and 'goto' at any + * point. + * + * When the scope is left user_##@_mode##_access_end() is automatically + * invoked. + * + * When the architecture supports masked user access and the access region + * which is determined by @uptr and @size is not a valid user space + * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user + * space address and does not terminate early. This optimizes for the good + * case and lets the performance uncritical bad case go through the fault. + * + * The eventual modification of the pointer is limited to the scope. + * Outside of the scope the original pointer value is unmodified, so that + * the original pointer value is available for diagnostic purposes in an + * out of scope fault path. + * + * Nesting scoped user access into a user access scope is invalid and fails + * the build. Nesting into other guards, e.g. pagefault is safe. + * + * The masked variant does not check the size of the access and relies on a + * mapping hole (e.g. guard page) to catch an out of range pointer, the + * first access to user memory inside the scope has to be within + * @uptr ... @uptr + PAGE_SIZE - 1 + * + * Don't use directly. Use scoped_masked_user_$MODE_access() instead. + */ +#define __scoped_user_access(mode, uptr, size, elbl) \ +for (bool done = false; !done; done = true) \ + for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ + !done; done = true) \ + for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ + /* Force modified pointer usage within the scope */ \ + for (const typeof(uptr) uptr = _tmpptr; !done; done = true) + +/** + * scoped_user_read_access_size - Start a scoped user read access with given size + * @usrc: Pointer to the user space address to read from + * @size: Size of the access starting from @usrc + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_read_access_size(usrc, size, elbl) \ + __scoped_user_access(read, usrc, size, elbl) + +/** + * scoped_user_read_access - Start a scoped user read access + * @usrc: Pointer to the user space address to read from + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @usrc is determined via sizeof(*@usrc)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_read_access(usrc, elbl) \ + scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl) + +/** + * scoped_user_write_access_size - Start a scoped user write access with given size + * @udst: Pointer to the user space address to write to + * @size: Size of the access starting from @udst + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_write_access_size(udst, size, elbl) \ + __scoped_user_access(write, udst, size, elbl) + +/** + * scoped_user_write_access - Start a scoped user write access + * @udst: Pointer to the user space address to write to + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @udst is determined via sizeof(*@udst)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_write_access(udst, elbl) \ + scoped_user_write_access_size(udst, sizeof(*(udst)), elbl) + +/** + * scoped_user_rw_access_size - Start a scoped user read/write access with given size + * @uptr Pointer to the user space address to read from and write to + * @size: Size of the access starting from @uptr + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_rw_access_size(uptr, size, elbl) \ + __scoped_user_access(rw, uptr, size, elbl) + +/** + * scoped_user_rw_access - Start a scoped user read/write access + * @uptr Pointer to the user space address to read from and write to + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @uptr is determined via sizeof(*@uptr)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_rw_access(uptr, elbl) \ + scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) + +/** + * get_user_inline - Read user data inlined + * @val: The variable to store the value read from user memory + * @usrc: Pointer to the user space memory to read from + * + * Return: 0 if successful, -EFAULT when faulted + * + * Inlined variant of get_user(). Only use when there is a demonstrable + * performance reason. + */ +#define get_user_inline(val, usrc) \ +({ \ + __label__ efault; \ + typeof(usrc) _tmpsrc = usrc; \ + int _ret = 0; \ + \ + scoped_user_read_access(_tmpsrc, efault) \ + unsafe_get_user(val, _tmpsrc, efault); \ + if (0) { \ + efault: \ + _ret = -EFAULT; \ + } \ + _ret; \ +}) + +/** + * put_user_inline - Write to user memory inlined + * @val: The value to write + * @udst: Pointer to the user space memory to write to + * + * Return: 0 if successful, -EFAULT when faulted + * + * Inlined variant of put_user(). Only use when there is a demonstrable + * performance reason. + */ +#define put_user_inline(val, udst) \ +({ \ + __label__ efault; \ + typeof(udst) _tmpdst = udst; \ + int _ret = 0; \ + \ + scoped_user_write_access(_tmpdst, efault) \ + unsafe_put_user(val, _tmpdst, efault); \ + if (0) { \ + efault: \ + _ret = -EFAULT; \ + } \ + _ret; \ +}) + #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index 26122d00708a..bc7ae7d21900 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -6,16 +6,6 @@ #include <linux/unwind_user.h> #include <linux/unwind_deferred_types.h> -struct unwind_work; - -typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie); - -struct unwind_work { - struct list_head list; - unwind_callback_t func; - int bit; -}; - #ifdef CONFIG_UNWIND_USER enum { @@ -44,22 +34,22 @@ void unwind_deferred_task_exit(struct task_struct *task); static __always_inline void unwind_reset_info(void) { struct unwind_task_info *info = ¤t->unwind_info; - unsigned long bits; + unsigned long bits = atomic_long_read(&info->unwind_mask); /* Was there any unwinding? */ - if (unlikely(info->unwind_mask)) { - bits = info->unwind_mask; - do { - /* Is a task_work going to run again before going back */ - if (bits & UNWIND_PENDING) - return; - } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); - current->unwind_info.id.id = 0; - - if (unlikely(info->cache)) { - info->cache->nr_entries = 0; - info->cache->unwind_completed = 0; - } + if (likely(!bits)) + return; + + do { + /* Is a task_work going to run again before going back */ + if (bits & UNWIND_PENDING) + return; + } while (!atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL)); + current->unwind_info.id.id = 0; + + if (unlikely(info->cache)) { + info->cache->nr_entries = 0; + info->cache->unwind_completed = 0; } } @@ -68,9 +58,17 @@ static __always_inline void unwind_reset_info(void) static inline void unwind_task_init(struct task_struct *task) {} static inline void unwind_task_free(struct task_struct *task) {} -static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; } -static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; } -static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; } +static inline int unwind_user_faultable(struct unwind_stacktrace *trace) +{ return -ENOSYS; } + +static inline int +unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) +{ return -ENOSYS; } + +static inline int +unwind_deferred_request(struct unwind_work *work, u64 *timestamp) +{ return -ENOSYS; } + static inline void unwind_deferred_cancel(struct unwind_work *work) {} static inline void unwind_deferred_task_exit(struct task_struct *task) {} diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 33b62ac25c86..18fa3932f61c 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -2,6 +2,9 @@ #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H +#include <linux/types.h> +#include <linux/atomic.h> + struct unwind_cache { unsigned long unwind_completed; unsigned int nr_entries; @@ -30,10 +33,23 @@ union unwind_task_id { }; struct unwind_task_info { - unsigned long unwind_mask; + atomic_long_t unwind_mask; struct unwind_cache *cache; struct callback_head work; union unwind_task_id id; }; +struct unwind_work; +struct unwind_stacktrace; + +typedef void (*unwind_callback_t)(struct unwind_work *work, + struct unwind_stacktrace *trace, + u64 cookie); + +struct unwind_work { + struct list_head list; + unwind_callback_t func; + int bit; +}; + #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */ diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h index a449f15be890..412729a269bc 100644 --- a/include/linux/unwind_user_types.h +++ b/include/linux/unwind_user_types.h @@ -36,8 +36,10 @@ struct unwind_user_state { unsigned long ip; unsigned long sp; unsigned long fp; + unsigned int ws; enum unwind_user_type current_type; unsigned int available_types; + bool topmost; bool done; }; diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 3aaf19e77558..8285b19a25e0 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -376,6 +376,9 @@ struct usb_gadget_ops { * can handle. The UDC must support this and all slower speeds and lower * number of lanes. * @state: the state we are now (attached, suspended, configured, etc) + * @state_lock: Spinlock protecting the `state` and `teardown` members. + * @teardown: True if the device is undergoing teardown, used to prevent + * new work from being scheduled during cleanup. * @name: Identifies the controller hardware type. Used in diagnostics * and sometimes configuration. * @dev: Driver model state for this abstract device. @@ -451,6 +454,8 @@ struct usb_gadget { enum usb_ssp_rate max_ssp_rate; enum usb_device_state state; + spinlock_t state_lock; + bool teardown; const char *name; struct device dev; unsigned isoch_delay; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 9a9aebbf96b9..9c3be157397e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } -#ifdef CONFIG_USER_NS - static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } +#ifdef CONFIG_USER_NS + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..f48e8ccffe81 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,11 +189,11 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); -/* writeback.h requires fs.h; it, too, is not included from here. */ -static inline void wait_on_inode(struct inode *inode) +static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc) { - wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(READ_ONCE(inode->i_state) & I_NEW)); + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + return PAGECACHE_TAG_TOWRITE; + return PAGECACHE_TAG_DIRTY; } #ifdef CONFIG_CGROUP_WRITEBACK @@ -234,7 +234,7 @@ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { - WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); + WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } @@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) + #endif /* WRITEBACK_H */ diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 86b0d47984a1..64e9afe7d647 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -85,12 +85,12 @@ int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, - struct inode **); + struct delegated_inode *); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, - const char *, struct inode **); + const char *, struct delegated_inode *); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); |
