summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/acpi/processor.h34
-rw-r--r--include/asm-generic/bug.h80
-rw-r--r--include/asm-generic/thread_info_tif.h3
-rw-r--r--include/asm-generic/vmlinux.lds.h81
-rw-r--r--include/crypto/blake2b.h143
-rw-r--r--include/crypto/blake2s.h126
-rw-r--r--include/crypto/chacha.h12
-rw-r--r--include/crypto/chacha20poly1305.h19
-rw-r--r--include/crypto/curve25519.h24
-rw-r--r--include/crypto/internal/blake2b.h101
-rw-r--r--include/crypto/md5.h11
-rw-r--r--include/crypto/poly1305.h2
-rw-r--r--include/crypto/polyval.h182
-rw-r--r--include/crypto/sha1.h12
-rw-r--r--include/crypto/sha2.h53
-rw-r--r--include/crypto/sha3.h320
-rw-r--r--include/drm/Makefile2
-rw-r--r--include/drm/intel/pciids.h5
-rw-r--r--include/linux/acpi.h49
-rw-r--r--include/linux/annotate.h134
-rw-r--r--include/linux/arch_topology.h17
-rw-r--r--include/linux/arm_mpam.h66
-rw-r--r--include/linux/ata.h1
-rw-r--r--include/linux/atomic/atomic-instrumented.h26
-rw-r--r--include/linux/backing-dev-defs.h2
-rw-r--r--include/linux/backing-dev.h5
-rw-r--r--include/linux/bitmap.h15
-rw-r--r--include/linux/blk_types.h11
-rw-r--r--include/linux/bug.h8
-rw-r--r--include/linux/byteorder/generic.h16
-rw-r--r--include/linux/cc_platform.h2
-rw-r--r--include/linux/ceph/libceph.h3
-rw-r--r--include/linux/cleanup.h57
-rw-r--r--include/linux/compiler.h8
-rw-r--r--include/linux/compiler_types.h26
-rw-r--r--include/linux/cpuidle.h6
-rw-r--r--include/linux/cpumask.h28
-rw-r--r--include/linux/cred.h22
-rw-r--r--include/linux/delay.h8
-rw-r--r--include/linux/devfreq-governor.h102
-rw-r--r--include/linux/dma-mapping.h2
-rw-r--r--include/linux/efi.h2
-rw-r--r--include/linux/elfnote.h13
-rw-r--r--include/linux/energy_model.h4
-rw-r--r--include/linux/entry-common.h38
-rw-r--r--include/linux/entry-virt.h2
-rw-r--r--include/linux/ethtool.h2
-rw-r--r--include/linux/fbcon.h2
-rw-r--r--include/linux/file.h126
-rw-r--r--include/linux/filelock.h98
-rw-r--r--include/linux/filter.h20
-rw-r--r--include/linux/freezer.h12
-rw-r--r--include/linux/fs.h727
-rw-r--r--include/linux/fs/super.h238
-rw-r--r--include/linux/fs/super_types.h336
-rw-r--r--include/linux/fs_dirent.h (renamed from include/linux/fs_types.h)11
-rw-r--r--include/linux/fs_struct.h6
-rw-r--r--include/linux/ftrace.h10
-rw-r--r--include/linux/gfp.h3
-rw-r--r--include/linux/highmem.h6
-rw-r--r--include/linux/huge_mm.h57
-rw-r--r--include/linux/iio/buffer-dma.h1
-rw-r--r--include/linux/iio/buffer_impl.h2
-rw-r--r--include/linux/init.h3
-rw-r--r--include/linux/init_task.h1
-rw-r--r--include/linux/intel_rapl.h2
-rw-r--r--include/linux/interrupt.h25
-rw-r--r--include/linux/interval_tree.h4
-rw-r--r--include/linux/interval_tree_generic.h2
-rw-r--r--include/linux/iomap.h86
-rw-r--r--include/linux/irq-entry-common.h77
-rw-r--r--include/linux/irq.h5
-rw-r--r--include/linux/irq_work.h9
-rw-r--r--include/linux/irq_work_types.h14
-rw-r--r--include/linux/irqchip.h8
-rw-r--r--include/linux/irqchip/irq-partition-percpu.h53
-rw-r--r--include/linux/irqdesc.h1
-rw-r--r--include/linux/irqdomain.h33
-rw-r--r--include/linux/kvm_types.h14
-rw-r--r--include/linux/livepatch.h25
-rw-r--r--include/linux/livepatch_external.h76
-rw-r--r--include/linux/livepatch_helpers.h77
-rw-r--r--include/linux/local_lock.h4
-rw-r--r--include/linux/local_lock_internal.h62
-rw-r--r--include/linux/lockdep.h2
-rw-r--r--include/linux/mailbox/mtk-cmdq-mailbox.h10
-rw-r--r--include/linux/map_benchmark.h1
-rw-r--r--include/linux/memory.h9
-rw-r--r--include/linux/memory_hotplug.h18
-rw-r--r--include/linux/memremap.h1
-rw-r--r--include/linux/mlx5/cq.h1
-rw-r--r--include/linux/mm.h48
-rw-r--r--include/linux/mm_types.h128
-rw-r--r--include/linux/module.h3
-rw-r--r--include/linux/msi.h3
-rw-r--r--include/linux/mutex.h45
-rw-r--r--include/linux/namei.h83
-rw-r--r--include/linux/net/intel/libie/fwlog.h12
-rw-r--r--include/linux/ns/ns_common_types.h196
-rw-r--r--include/linux/ns/nstree_types.h55
-rw-r--r--include/linux/ns_common.h233
-rw-r--r--include/linux/nsfs.h3
-rw-r--r--include/linux/nsproxy.h9
-rw-r--r--include/linux/nstree.h52
-rw-r--r--include/linux/objtool.h96
-rw-r--r--include/linux/objtool_types.h2
-rw-r--r--include/linux/of_irq.h7
-rw-r--r--include/linux/pagemap.h18
-rw-r--r--include/linux/pci.h2
-rw-r--r--include/linux/percpu-defs.h2
-rw-r--r--include/linux/perf/arm_pmu.h7
-rw-r--r--include/linux/perf_event.h2
-rw-r--r--include/linux/pgtable.h4
-rw-r--r--include/linux/pid_namespace.h3
-rw-r--r--include/linux/pipe_fs_i.h23
-rw-r--r--include/linux/platform_data/x86/int3472.h1
-rw-r--r--include/linux/platform_device.h3
-rw-r--r--include/linux/pm.h8
-rw-r--r--include/linux/pm_domain.h1
-rw-r--r--include/linux/pm_qos.h9
-rw-r--r--include/linux/pm_runtime.h24
-rw-r--r--include/linux/preempt.h2
-rw-r--r--include/linux/prmt.h2
-rw-r--r--include/linux/pseudo_fs.h1
-rw-r--r--include/linux/regmap.h2
-rw-r--r--include/linux/resctrl.h24
-rw-r--r--include/linux/restart_block.h2
-rw-r--r--include/linux/resume_user_mode.h2
-rw-r--r--include/linux/rseq.h214
-rw-r--r--include/linux/rseq_entry.h616
-rw-r--r--include/linux/rseq_types.h164
-rw-r--r--include/linux/sched.h95
-rw-r--r--include/linux/sched/coredump.h2
-rw-r--r--include/linux/sched/topology.h3
-rw-r--r--include/linux/seqlock.h114
-rw-r--r--include/linux/shmem_fs.h2
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/linux/thread_info.h5
-rw-r--r--include/linux/timer.h9
-rw-r--r--include/linux/types.h1
-rw-r--r--include/linux/uaccess.h314
-rw-r--r--include/linux/unwind_deferred.h52
-rw-r--r--include/linux/unwind_deferred_types.h18
-rw-r--r--include/linux/unwind_user_types.h2
-rw-r--r--include/linux/usb/gadget.h5
-rw-r--r--include/linux/user_namespace.h4
-rw-r--r--include/linux/virtio_net.h10
-rw-r--r--include/linux/writeback.h15
-rw-r--r--include/linux/xattr.h4
-rw-r--r--include/net/bluetooth/hci.h6
-rw-r--r--include/net/bluetooth/hci_core.h22
-rw-r--r--include/net/bluetooth/l2cap.h4
-rw-r--r--include/net/bluetooth/mgmt.h4
-rw-r--r--include/net/cfg80211.h78
-rw-r--r--include/net/libeth/xdp.h2
-rw-r--r--include/net/pkt_cls.h2
-rw-r--r--include/net/tcp.h2
-rw-r--r--include/net/tls.h25
-rw-r--r--include/net/xfrm.h3
-rw-r--r--include/scsi/scsi_device.h10
-rw-r--r--include/trace/events/power.h3
-rw-r--r--include/trace/events/rseq.h4
-rw-r--r--include/trace/events/tcp.h9
-rw-r--r--include/trace/events/timer_migration.h4
-rw-r--r--include/trace/events/writeback.h8
-rw-r--r--include/uapi/asm-generic/posix_types.h1
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/drm/drm_fourcc.h25
-rw-r--r--include/uapi/linux/energy_model.h62
-rw-r--r--include/uapi/linux/fb.h2
-rw-r--r--include/uapi/linux/fcntl.h16
-rw-r--r--include/uapi/linux/input-event-codes.h14
-rw-r--r--include/uapi/linux/io_uring.h12
-rw-r--r--include/uapi/linux/io_uring/query.h3
-rw-r--r--include/uapi/linux/isst_if.h50
-rw-r--r--include/uapi/linux/mount.h2
-rw-r--r--include/uapi/linux/nsfs.h58
-rw-r--r--include/uapi/linux/perf_event.h23
-rw-r--r--include/uapi/linux/pidfd.h11
-rw-r--r--include/uapi/linux/rseq.h21
-rw-r--r--include/uapi/linux/tee.h23
-rw-r--r--include/uapi/linux/virtio_net.h3
-rw-r--r--include/ufs/ufshcd.h7
183 files changed, 5483 insertions, 1973 deletions
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 7146a8e9e9c2..d0eccbd920e5 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -417,15 +417,32 @@ static inline void acpi_processor_throttling_init(void) {}
#endif /* CONFIG_ACPI_CPU_FREQ_PSS */
/* in processor_idle.c */
+extern struct cpuidle_driver acpi_idle_driver;
#ifdef CONFIG_ACPI_PROCESSOR_IDLE
-void acpi_processor_power_init(struct acpi_processor *pr);
-void acpi_processor_power_exit(struct acpi_processor *pr);
+int acpi_processor_power_init(struct acpi_processor *pr);
+int acpi_processor_power_exit(struct acpi_processor *pr);
int acpi_processor_power_state_has_changed(struct acpi_processor *pr);
int acpi_processor_hotplug(struct acpi_processor *pr);
-void acpi_processor_register_idle_driver(void);
-void acpi_processor_unregister_idle_driver(void);
-int acpi_processor_ffh_lpi_probe(unsigned int cpu);
-int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
+#else
+static inline int acpi_processor_power_init(struct acpi_processor *pr)
+{
+ return -ENODEV;
+}
+
+static inline int acpi_processor_power_exit(struct acpi_processor *pr)
+{
+ return -ENODEV;
+}
+
+static inline int acpi_processor_power_state_has_changed(struct acpi_processor *pr)
+{
+ return -ENODEV;
+}
+
+static inline int acpi_processor_hotplug(struct acpi_processor *pr)
+{
+ return -ENODEV;
+}
#endif /* CONFIG_ACPI_PROCESSOR_IDLE */
/* in processor_thermal.c */
@@ -448,6 +465,11 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy)
}
#endif /* CONFIG_CPU_FREQ */
+#ifdef CONFIG_ACPI_PROCESSOR_IDLE
+extern int acpi_processor_ffh_lpi_probe(unsigned int cpu);
+extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
+#endif
+
void acpi_processor_init_invariance_cppc(void);
#endif
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 387720933973..09e8eccee8ed 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -13,10 +13,19 @@
#define BUGFLAG_ONCE (1 << 1)
#define BUGFLAG_DONE (1 << 2)
#define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */
+#define BUGFLAG_ARGS (1 << 4)
#define BUGFLAG_TAINT(taint) ((taint) << 8)
#define BUG_GET_TAINT(bug) ((bug)->flags >> 8)
#endif
+#ifndef WARN_CONDITION_STR
+#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED
+# define WARN_CONDITION_STR(cond_str) "[" cond_str "] "
+#else
+# define WARN_CONDITION_STR(cond_str)
+#endif
+#endif /* WARN_CONDITION_STR */
+
#ifndef __ASSEMBLY__
#include <linux/panic.h>
#include <linux/printk.h>
@@ -29,19 +38,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
#ifdef CONFIG_BUG
-#ifdef CONFIG_GENERIC_BUG
-struct bug_entry {
#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
- unsigned long bug_addr;
+#define BUG_REL(type, name) type name
#else
- signed int bug_addr_disp;
+#define BUG_REL(type, name) signed int name##_disp
#endif
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
- const char *file;
-#else
- signed int file_disp;
+
+#ifdef CONFIG_GENERIC_BUG
+struct bug_entry {
+ BUG_REL(unsigned long, bug_addr);
+#ifdef HAVE_ARCH_BUG_FORMAT
+ BUG_REL(const char *, format);
#endif
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ BUG_REL(const char *, file);
unsigned short line;
#endif
unsigned short flags;
@@ -92,28 +102,50 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint,
const char *fmt, ...);
extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
-#ifndef __WARN_FLAGS
-#define __WARN() __WARN_printf(TAINT_WARN, NULL)
+#ifdef __WARN_FLAGS
+#define __WARN() __WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN))
+
+#ifndef WARN_ON
+#define WARN_ON(condition) ({ \
+ int __ret_warn_on = !!(condition); \
+ if (unlikely(__ret_warn_on)) \
+ __WARN_FLAGS(#condition, \
+ BUGFLAG_TAINT(TAINT_WARN)); \
+ unlikely(__ret_warn_on); \
+})
+#endif
+
+#ifndef WARN_ON_ONCE
+#define WARN_ON_ONCE(condition) ({ \
+ int __ret_warn_on = !!(condition); \
+ if (unlikely(__ret_warn_on)) \
+ __WARN_FLAGS(#condition, \
+ BUGFLAG_ONCE | \
+ BUGFLAG_TAINT(TAINT_WARN)); \
+ unlikely(__ret_warn_on); \
+})
+#endif
+#endif /* __WARN_FLAGS */
+
+#if defined(__WARN_FLAGS) && !defined(__WARN_printf)
#define __WARN_printf(taint, arg...) do { \
instrumentation_begin(); \
- warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \
+ __warn_printk(arg); \
+ __WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
instrumentation_end(); \
} while (0)
-#else
-#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))
+#endif
+
+#ifndef __WARN_printf
#define __WARN_printf(taint, arg...) do { \
instrumentation_begin(); \
- __warn_printk(arg); \
- __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
+ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \
instrumentation_end(); \
} while (0)
-#define WARN_ON_ONCE(condition) ({ \
- int __ret_warn_on = !!(condition); \
- if (unlikely(__ret_warn_on)) \
- __WARN_FLAGS(BUGFLAG_ONCE | \
- BUGFLAG_TAINT(TAINT_WARN)); \
- unlikely(__ret_warn_on); \
-})
+#endif
+
+#ifndef __WARN
+#define __WARN() __WARN_printf(TAINT_WARN, NULL)
#endif
/* used internally by panic.c */
@@ -148,8 +180,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
DO_ONCE_LITE_IF(condition, WARN_ON, 1)
#endif
+#ifndef WARN_ONCE
#define WARN_ONCE(condition, format...) \
DO_ONCE_LITE_IF(condition, WARN, 1, format)
+#endif
#define WARN_TAINT_ONCE(condition, taint, format...) \
DO_ONCE_LITE_IF(condition, WARN_TAINT, 1, taint, format)
diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h
index ee3793e9b1a4..da1610a78f92 100644
--- a/include/asm-generic/thread_info_tif.h
+++ b/include/asm-generic/thread_info_tif.h
@@ -45,4 +45,7 @@
# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK)
#endif
+#define TIF_RSEQ 11 // Run RSEQ fast path
+#define _TIF_RSEQ BIT(TIF_RSEQ)
+
#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8a9a2e732a65..a464ff6c1a61 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -87,39 +87,56 @@
#define ALIGN_FUNCTION() . = ALIGN(CONFIG_FUNCTION_ALIGNMENT)
/*
- * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which
- * generates .data.identifier sections, which need to be pulled in with
- * .data. We don't want to pull in .data..other sections, which Linux
- * has defined. Same for text and bss.
+ * Support -ffunction-sections by matching .text and .text.*,
+ * but exclude '.text..*', .text.startup[.*], and .text.exit[.*].
*
- * With LTO_CLANG, the linker also splits sections by default, so we need
- * these macros to combine the sections during the final link.
+ * .text.startup and .text.startup.* are matched later by INIT_TEXT, and
+ * .text.exit and .text.exit.* are matched later by EXIT_TEXT, so they must be
+ * explicitly excluded here.
*
- * With AUTOFDO_CLANG and PROPELLER_CLANG, by default, the linker splits
- * text sections and regroups functions into subsections.
+ * Other .text.* sections that are typically grouped separately, such as
+ * .text.unlikely or .text.hot, must be matched explicitly before using
+ * TEXT_MAIN.
*
- * RODATA_MAIN is not used because existing code already defines .rodata.x
- * sections to be brought in with rodata.
+ * NOTE: builds *with* and *without* -ffunction-sections are both supported by
+ * this single macro. Even with -ffunction-sections, there may be some objects
+ * NOT compiled with the flag due to the use of a specific Makefile override
+ * like cflags-y or AUTOFDO_PROFILE_foo.o. So this single catchall rule is
+ * needed to support mixed object builds.
+ *
+ * One implication is that functions named startup(), exit(), split(),
+ * unlikely(), hot(), and unknown() are not allowed in the kernel due to the
+ * ambiguity of their section names with -ffunction-sections. For example,
+ * .text.startup could be __attribute__((constructor)) code in a *non*
+ * ffunction-sections object, which should be placed in .init.text; or it could
+ * be an actual function named startup() in an ffunction-sections object, which
+ * should be placed in .text. The build will detect and complain about any such
+ * ambiguously named functions.
+ */
+#define TEXT_MAIN \
+ .text \
+ .text.[_0-9A-Za-df-rt-z]* \
+ .text.s[_0-9A-Za-su-z]* .text.s .text.s.* \
+ .text.st[_0-9A-Zb-z]* .text.st .text.st.* \
+ .text.sta[_0-9A-Za-qs-z]* .text.sta .text.sta.* \
+ .text.star[_0-9A-Za-su-z]* .text.star .text.star.* \
+ .text.start[_0-9A-Za-tv-z]* .text.start .text.start.* \
+ .text.startu[_0-9A-Za-oq-z]* .text.startu .text.startu.* \
+ .text.startup[_0-9A-Za-z]* \
+ .text.e[_0-9A-Za-wy-z]* .text.e .text.e.* \
+ .text.ex[_0-9A-Za-hj-z]* .text.ex .text.ex.* \
+ .text.exi[_0-9A-Za-su-z]* .text.exi .text.exi.* \
+ .text.exit[_0-9A-Za-z]*
+
+/*
+ * Support -fdata-sections by matching .data, .data.*, and others,
+ * but exclude '.data..*'.
*/
-#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) || \
-defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
-#define TEXT_MAIN .text .text.[0-9a-zA-Z_]*
-#else
-#define TEXT_MAIN .text
-#endif
-#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG)
#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data.rel.* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L*
#define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]*
#define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L*
#define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..L* .bss..compoundliteral*
#define SBSS_MAIN .sbss .sbss.[0-9a-zA-Z_]*
-#else
-#define DATA_MAIN .data .data.rel .data.rel.local
-#define SDATA_MAIN .sdata
-#define RODATA_MAIN .rodata
-#define BSS_MAIN .bss
-#define SBSS_MAIN .sbss
-#endif
/*
* GCC 4.5 and later have a 32 bytes section alignment for structures.
@@ -581,9 +598,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
* during second ld run in second ld pass when generating System.map
*
* TEXT_MAIN here will match symbols with a fixed pattern (for example,
- * .text.hot or .text.unlikely) if dead code elimination or
- * function-section is enabled. Match these symbols first before
- * TEXT_MAIN to ensure they are grouped together.
+ * .text.hot or .text.unlikely). Match those before TEXT_MAIN to ensure
+ * they get grouped together.
*
* Also placing .text.hot section at the beginning of a page, this
* would help the TLB performance.
@@ -729,16 +745,16 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
#define INIT_TEXT \
*(.init.text .init.text.*) \
- *(.text.startup)
+ *(.text.startup .text.startup.*)
#define EXIT_DATA \
*(.exit.data .exit.data.*) \
*(.fini_array .fini_array.*) \
- *(.dtors .dtors.*) \
+ *(.dtors .dtors.*)
#define EXIT_TEXT \
*(.exit.text) \
- *(.text.exit) \
+ *(.text.exit .text.exit.*)
#define EXIT_CALL \
*(.exitcall.exit)
@@ -832,7 +848,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
/* Required sections not related to debugging. */
#define ELF_DETAILS \
- .modinfo : { *(.modinfo) } \
+ .modinfo : { *(.modinfo) . = ALIGN(8); } \
.comment 0 : { *(.comment) } \
.symtab 0 : { *(.symtab) } \
.strtab 0 : { *(.strtab) } \
@@ -955,7 +971,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
#define RUNTIME_CONST_VARIABLES \
RUNTIME_CONST(shift, d_hash_shift) \
- RUNTIME_CONST(ptr, dentry_hashtable)
+ RUNTIME_CONST(ptr, dentry_hashtable) \
+ RUNTIME_CONST(ptr, __dentry_cache)
/* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */
#define KUNIT_TABLE() \
diff --git a/include/crypto/blake2b.h b/include/crypto/blake2b.h
index dd7694477e50..3bc37fd103a7 100644
--- a/include/crypto/blake2b.h
+++ b/include/crypto/blake2b.h
@@ -7,20 +7,10 @@
#include <linux/types.h>
#include <linux/string.h>
-struct blake2b_state {
- /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */
- u64 h[8];
- u64 t[2];
- /* The true state ends here. The rest is temporary storage. */
- u64 f[2];
-};
-
enum blake2b_lengths {
BLAKE2B_BLOCK_SIZE = 128,
BLAKE2B_HASH_SIZE = 64,
BLAKE2B_KEY_SIZE = 64,
- BLAKE2B_STATE_SIZE = offsetof(struct blake2b_state, f),
- BLAKE2B_DESC_SIZE = sizeof(struct blake2b_state),
BLAKE2B_160_HASH_SIZE = 20,
BLAKE2B_256_HASH_SIZE = 32,
@@ -28,6 +18,25 @@ enum blake2b_lengths {
BLAKE2B_512_HASH_SIZE = 64,
};
+/**
+ * struct blake2b_ctx - Context for hashing a message with BLAKE2b
+ * @h: compression function state
+ * @t: block counter
+ * @f: finalization indicator
+ * @buf: partial block buffer; 'buflen' bytes are valid
+ * @buflen: number of bytes buffered in @buf
+ * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
+ */
+struct blake2b_ctx {
+ /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */
+ u64 h[8];
+ u64 t[2];
+ u64 f[2];
+ u8 buf[BLAKE2B_BLOCK_SIZE];
+ unsigned int buflen;
+ unsigned int outlen;
+};
+
enum blake2b_iv {
BLAKE2B_IV0 = 0x6A09E667F3BCC908ULL,
BLAKE2B_IV1 = 0xBB67AE8584CAA73BULL,
@@ -39,19 +48,109 @@ enum blake2b_iv {
BLAKE2B_IV7 = 0x5BE0CD19137E2179ULL,
};
-static inline void __blake2b_init(struct blake2b_state *state, size_t outlen,
- size_t keylen)
+static inline void __blake2b_init(struct blake2b_ctx *ctx, size_t outlen,
+ const void *key, size_t keylen)
+{
+ ctx->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen);
+ ctx->h[1] = BLAKE2B_IV1;
+ ctx->h[2] = BLAKE2B_IV2;
+ ctx->h[3] = BLAKE2B_IV3;
+ ctx->h[4] = BLAKE2B_IV4;
+ ctx->h[5] = BLAKE2B_IV5;
+ ctx->h[6] = BLAKE2B_IV6;
+ ctx->h[7] = BLAKE2B_IV7;
+ ctx->t[0] = 0;
+ ctx->t[1] = 0;
+ ctx->f[0] = 0;
+ ctx->f[1] = 0;
+ ctx->buflen = 0;
+ ctx->outlen = outlen;
+ if (keylen) {
+ memcpy(ctx->buf, key, keylen);
+ memset(&ctx->buf[keylen], 0, BLAKE2B_BLOCK_SIZE - keylen);
+ ctx->buflen = BLAKE2B_BLOCK_SIZE;
+ }
+}
+
+/**
+ * blake2b_init() - Initialize a BLAKE2b context for a new message (unkeyed)
+ * @ctx: the context to initialize
+ * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2b_init(struct blake2b_ctx *ctx, size_t outlen)
{
- state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen);
- state->h[1] = BLAKE2B_IV1;
- state->h[2] = BLAKE2B_IV2;
- state->h[3] = BLAKE2B_IV3;
- state->h[4] = BLAKE2B_IV4;
- state->h[5] = BLAKE2B_IV5;
- state->h[6] = BLAKE2B_IV6;
- state->h[7] = BLAKE2B_IV7;
- state->t[0] = 0;
- state->t[1] = 0;
+ __blake2b_init(ctx, outlen, NULL, 0);
+}
+
+/**
+ * blake2b_init_key() - Initialize a BLAKE2b context for a new message (keyed)
+ * @ctx: the context to initialize
+ * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
+ * @key: the key
+ * @keylen: the key length in bytes, at most BLAKE2B_KEY_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2b_init_key(struct blake2b_ctx *ctx, size_t outlen,
+ const void *key, size_t keylen)
+{
+ WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2B_HASH_SIZE ||
+ !key || !keylen || keylen > BLAKE2B_KEY_SIZE));
+
+ __blake2b_init(ctx, outlen, key, keylen);
+}
+
+/**
+ * blake2b_update() - Update a BLAKE2b context with message data
+ * @ctx: the context to update; must have been initialized
+ * @in: the message data
+ * @inlen: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen);
+
+/**
+ * blake2b_final() - Finish computing a BLAKE2b hash
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting BLAKE2b hash. Its length will be equal to the
+ * @outlen that was passed to blake2b_init() or blake2b_init_key().
+ *
+ * After finishing, this zeroizes @ctx. So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void blake2b_final(struct blake2b_ctx *ctx, u8 *out);
+
+/**
+ * blake2b() - Compute BLAKE2b hash in one shot
+ * @key: the key, or NULL for an unkeyed hash
+ * @keylen: the key length in bytes (at most BLAKE2B_KEY_SIZE), or 0 for an
+ * unkeyed hash
+ * @in: the message data
+ * @inlen: the data length in bytes
+ * @out: (output) the resulting BLAKE2b hash, with length @outlen
+ * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2b(const u8 *key, size_t keylen,
+ const u8 *in, size_t inlen,
+ u8 *out, size_t outlen)
+{
+ struct blake2b_ctx ctx;
+
+ WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
+ outlen > BLAKE2B_HASH_SIZE || keylen > BLAKE2B_KEY_SIZE ||
+ (!key && keylen)));
+
+ __blake2b_init(&ctx, outlen, key, keylen);
+ blake2b_update(&ctx, in, inlen);
+ blake2b_final(&ctx, out);
}
#endif /* _CRYPTO_BLAKE2B_H */
diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h
index f9ffd39194eb..648cb7824358 100644
--- a/include/crypto/blake2s.h
+++ b/include/crypto/blake2s.h
@@ -22,7 +22,16 @@ enum blake2s_lengths {
BLAKE2S_256_HASH_SIZE = 32,
};
-struct blake2s_state {
+/**
+ * struct blake2s_ctx - Context for hashing a message with BLAKE2s
+ * @h: compression function state
+ * @t: block counter
+ * @f: finalization indicator
+ * @buf: partial block buffer; 'buflen' bytes are valid
+ * @buflen: number of bytes buffered in @buf
+ * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE
+ */
+struct blake2s_ctx {
/* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */
u32 h[8];
u32 t[2];
@@ -43,62 +52,109 @@ enum blake2s_iv {
BLAKE2S_IV7 = 0x5BE0CD19UL,
};
-static inline void __blake2s_init(struct blake2s_state *state, size_t outlen,
+static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen,
const void *key, size_t keylen)
{
- state->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen);
- state->h[1] = BLAKE2S_IV1;
- state->h[2] = BLAKE2S_IV2;
- state->h[3] = BLAKE2S_IV3;
- state->h[4] = BLAKE2S_IV4;
- state->h[5] = BLAKE2S_IV5;
- state->h[6] = BLAKE2S_IV6;
- state->h[7] = BLAKE2S_IV7;
- state->t[0] = 0;
- state->t[1] = 0;
- state->f[0] = 0;
- state->f[1] = 0;
- state->buflen = 0;
- state->outlen = outlen;
+ ctx->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen);
+ ctx->h[1] = BLAKE2S_IV1;
+ ctx->h[2] = BLAKE2S_IV2;
+ ctx->h[3] = BLAKE2S_IV3;
+ ctx->h[4] = BLAKE2S_IV4;
+ ctx->h[5] = BLAKE2S_IV5;
+ ctx->h[6] = BLAKE2S_IV6;
+ ctx->h[7] = BLAKE2S_IV7;
+ ctx->t[0] = 0;
+ ctx->t[1] = 0;
+ ctx->f[0] = 0;
+ ctx->f[1] = 0;
+ ctx->buflen = 0;
+ ctx->outlen = outlen;
if (keylen) {
- memcpy(state->buf, key, keylen);
- memset(&state->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen);
- state->buflen = BLAKE2S_BLOCK_SIZE;
+ memcpy(ctx->buf, key, keylen);
+ memset(&ctx->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen);
+ ctx->buflen = BLAKE2S_BLOCK_SIZE;
}
}
-static inline void blake2s_init(struct blake2s_state *state,
- const size_t outlen)
+/**
+ * blake2s_init() - Initialize a BLAKE2s context for a new message (unkeyed)
+ * @ctx: the context to initialize
+ * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen)
{
- __blake2s_init(state, outlen, NULL, 0);
+ __blake2s_init(ctx, outlen, NULL, 0);
}
-static inline void blake2s_init_key(struct blake2s_state *state,
- const size_t outlen, const void *key,
- const size_t keylen)
+/**
+ * blake2s_init_key() - Initialize a BLAKE2s context for a new message (keyed)
+ * @ctx: the context to initialize
+ * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE
+ * @key: the key
+ * @keylen: the key length in bytes, at most BLAKE2S_KEY_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen,
+ const void *key, size_t keylen)
{
WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE ||
!key || !keylen || keylen > BLAKE2S_KEY_SIZE));
- __blake2s_init(state, outlen, key, keylen);
+ __blake2s_init(ctx, outlen, key, keylen);
}
-void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen);
-void blake2s_final(struct blake2s_state *state, u8 *out);
+/**
+ * blake2s_update() - Update a BLAKE2s context with message data
+ * @ctx: the context to update; must have been initialized
+ * @in: the message data
+ * @inlen: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen);
-static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
- const size_t outlen, const size_t inlen,
- const size_t keylen)
+/**
+ * blake2s_final() - Finish computing a BLAKE2s hash
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting BLAKE2s hash. Its length will be equal to the
+ * @outlen that was passed to blake2s_init() or blake2s_init_key().
+ *
+ * After finishing, this zeroizes @ctx. So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out);
+
+/**
+ * blake2s() - Compute BLAKE2s hash in one shot
+ * @key: the key, or NULL for an unkeyed hash
+ * @keylen: the key length in bytes (at most BLAKE2S_KEY_SIZE), or 0 for an
+ * unkeyed hash
+ * @in: the message data
+ * @inlen: the data length in bytes
+ * @out: (output) the resulting BLAKE2s hash, with length @outlen
+ * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE
+ *
+ * Context: Any context.
+ */
+static inline void blake2s(const u8 *key, size_t keylen,
+ const u8 *in, size_t inlen,
+ u8 *out, size_t outlen)
{
- struct blake2s_state state;
+ struct blake2s_ctx ctx;
WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE ||
(!key && keylen)));
- __blake2s_init(&state, outlen, key, keylen);
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, out);
+ __blake2s_init(&ctx, outlen, key, keylen);
+ blake2s_update(&ctx, in, inlen);
+ blake2s_final(&ctx, out);
}
#endif /* _CRYPTO_BLAKE2S_H */
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
index 38e26dff27b0..1cc301a48469 100644
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -38,18 +38,18 @@ struct chacha_state {
};
void chacha_block_generic(struct chacha_state *state,
- u8 out[CHACHA_BLOCK_SIZE], int nrounds);
+ u8 out[at_least CHACHA_BLOCK_SIZE], int nrounds);
static inline void chacha20_block(struct chacha_state *state,
- u8 out[CHACHA_BLOCK_SIZE])
+ u8 out[at_least CHACHA_BLOCK_SIZE])
{
chacha_block_generic(state, out, 20);
}
void hchacha_block_generic(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
+ u32 out[at_least HCHACHA_OUT_WORDS], int nrounds);
void hchacha_block(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
+ u32 out[at_least HCHACHA_OUT_WORDS], int nrounds);
enum chacha_constants { /* expand 32-byte k */
CHACHA_CONSTANT_EXPA = 0x61707865U,
@@ -67,8 +67,8 @@ static inline void chacha_init_consts(struct chacha_state *state)
}
static inline void chacha_init(struct chacha_state *state,
- const u32 key[CHACHA_KEY_WORDS],
- const u8 iv[CHACHA_IV_SIZE])
+ const u32 key[at_least CHACHA_KEY_WORDS],
+ const u8 iv[at_least CHACHA_IV_SIZE])
{
chacha_init_consts(state);
state->x[4] = key[0];
diff --git a/include/crypto/chacha20poly1305.h b/include/crypto/chacha20poly1305.h
index d2ac3ff7dc1e..0f71b037702d 100644
--- a/include/crypto/chacha20poly1305.h
+++ b/include/crypto/chacha20poly1305.h
@@ -18,32 +18,33 @@ enum chacha20poly1305_lengths {
void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
bool __must_check
chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len, const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+ const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
bool __must_check xchacha20poly1305_decrypt(
- u8 *dst, const u8 *src, const size_t src_len, const u8 *ad,
- const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+ u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]);
bool chacha20poly1305_selftest(void);
diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h
index db63a5577c00..2362b48f8741 100644
--- a/include/crypto/curve25519.h
+++ b/include/crypto/curve25519.h
@@ -13,24 +13,28 @@ enum curve25519_lengths {
CURVE25519_KEY_SIZE = 32
};
-void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
- const u8 scalar[CURVE25519_KEY_SIZE],
- const u8 point[CURVE25519_KEY_SIZE]);
+void curve25519_generic(u8 out[at_least CURVE25519_KEY_SIZE],
+ const u8 scalar[at_least CURVE25519_KEY_SIZE],
+ const u8 point[at_least CURVE25519_KEY_SIZE]);
-bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE],
- const u8 basepoint[CURVE25519_KEY_SIZE]);
+bool __must_check
+curve25519(u8 mypublic[at_least CURVE25519_KEY_SIZE],
+ const u8 secret[at_least CURVE25519_KEY_SIZE],
+ const u8 basepoint[at_least CURVE25519_KEY_SIZE]);
-bool __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE]);
+bool __must_check
+curve25519_generate_public(u8 pub[at_least CURVE25519_KEY_SIZE],
+ const u8 secret[at_least CURVE25519_KEY_SIZE]);
-static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE])
+static inline void
+curve25519_clamp_secret(u8 secret[at_least CURVE25519_KEY_SIZE])
{
secret[0] &= 248;
secret[31] = (secret[31] & 127) | 64;
}
-static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE])
+static inline void
+curve25519_generate_secret(u8 secret[at_least CURVE25519_KEY_SIZE])
{
get_random_bytes_wait(secret, CURVE25519_KEY_SIZE);
curve25519_clamp_secret(secret);
diff --git a/include/crypto/internal/blake2b.h b/include/crypto/internal/blake2b.h
deleted file mode 100644
index 3e09e2485306..000000000000
--- a/include/crypto/internal/blake2b.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Helper functions for BLAKE2b implementations.
- * Keep this in sync with the corresponding BLAKE2s header.
- */
-
-#ifndef _CRYPTO_INTERNAL_BLAKE2B_H
-#define _CRYPTO_INTERNAL_BLAKE2B_H
-
-#include <asm/byteorder.h>
-#include <crypto/blake2b.h>
-#include <crypto/internal/hash.h>
-#include <linux/array_size.h>
-#include <linux/compiler.h>
-#include <linux/build_bug.h>
-#include <linux/errno.h>
-#include <linux/math.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-static inline void blake2b_set_lastblock(struct blake2b_state *state)
-{
- state->f[0] = -1;
- state->f[1] = 0;
-}
-
-static inline void blake2b_set_nonlast(struct blake2b_state *state)
-{
- state->f[0] = 0;
- state->f[1] = 0;
-}
-
-typedef void (*blake2b_compress_t)(struct blake2b_state *state,
- const u8 *block, size_t nblocks, u32 inc);
-
-/* Helper functions for shash implementations of BLAKE2b */
-
-struct blake2b_tfm_ctx {
- u8 key[BLAKE2B_BLOCK_SIZE];
- unsigned int keylen;
-};
-
-static inline int crypto_blake2b_setkey(struct crypto_shash *tfm,
- const u8 *key, unsigned int keylen)
-{
- struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm);
-
- if (keylen > BLAKE2B_KEY_SIZE)
- return -EINVAL;
-
- BUILD_BUG_ON(BLAKE2B_KEY_SIZE > BLAKE2B_BLOCK_SIZE);
-
- memcpy(tctx->key, key, keylen);
- memset(tctx->key + keylen, 0, BLAKE2B_BLOCK_SIZE - keylen);
- tctx->keylen = keylen;
-
- return 0;
-}
-
-static inline int crypto_blake2b_init(struct shash_desc *desc)
-{
- const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
- struct blake2b_state *state = shash_desc_ctx(desc);
- unsigned int outlen = crypto_shash_digestsize(desc->tfm);
-
- __blake2b_init(state, outlen, tctx->keylen);
- return tctx->keylen ?
- crypto_shash_update(desc, tctx->key, BLAKE2B_BLOCK_SIZE) : 0;
-}
-
-static inline int crypto_blake2b_update_bo(struct shash_desc *desc,
- const u8 *in, unsigned int inlen,
- blake2b_compress_t compress)
-{
- struct blake2b_state *state = shash_desc_ctx(desc);
-
- blake2b_set_nonlast(state);
- compress(state, in, inlen / BLAKE2B_BLOCK_SIZE, BLAKE2B_BLOCK_SIZE);
- return inlen - round_down(inlen, BLAKE2B_BLOCK_SIZE);
-}
-
-static inline int crypto_blake2b_finup(struct shash_desc *desc, const u8 *in,
- unsigned int inlen, u8 *out,
- blake2b_compress_t compress)
-{
- struct blake2b_state *state = shash_desc_ctx(desc);
- u8 buf[BLAKE2B_BLOCK_SIZE];
- int i;
-
- memcpy(buf, in, inlen);
- memset(buf + inlen, 0, BLAKE2B_BLOCK_SIZE - inlen);
- blake2b_set_lastblock(state);
- compress(state, buf, 1, inlen);
- for (i = 0; i < ARRAY_SIZE(state->h); i++)
- __cpu_to_le64s(&state->h[i]);
- memcpy(out, state->h, crypto_shash_digestsize(desc->tfm));
- memzero_explicit(buf, sizeof(buf));
- return 0;
-}
-
-#endif /* _CRYPTO_INTERNAL_BLAKE2B_H */
diff --git a/include/crypto/md5.h b/include/crypto/md5.h
index c9aa5c3abc53..c47aedfe67ec 100644
--- a/include/crypto/md5.h
+++ b/include/crypto/md5.h
@@ -76,7 +76,7 @@ void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len);
*
* Context: Any context.
*/
-void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
+void md5_final(struct md5_ctx *ctx, u8 out[at_least MD5_DIGEST_SIZE]);
/**
* md5() - Compute MD5 message digest in one shot
@@ -86,7 +86,7 @@ void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
*
* Context: Any context.
*/
-void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]);
+void md5(const u8 *data, size_t len, u8 out[at_least MD5_DIGEST_SIZE]);
/**
* struct hmac_md5_key - Prepared key for HMAC-MD5
@@ -173,7 +173,7 @@ static inline void hmac_md5_update(struct hmac_md5_ctx *ctx,
*
* Context: Any context.
*/
-void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
+void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[at_least MD5_DIGEST_SIZE]);
/**
* hmac_md5() - Compute HMAC-MD5 in one shot, using a prepared key
@@ -187,7 +187,8 @@ void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
* Context: Any context.
*/
void hmac_md5(const struct hmac_md5_key *key,
- const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]);
+ const u8 *data, size_t data_len,
+ u8 out[at_least MD5_DIGEST_SIZE]);
/**
* hmac_md5_usingrawkey() - Compute HMAC-MD5 in one shot, using a raw key
@@ -204,6 +205,6 @@ void hmac_md5(const struct hmac_md5_key *key,
*/
void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len,
const u8 *data, size_t data_len,
- u8 out[MD5_DIGEST_SIZE]);
+ u8 out[at_least MD5_DIGEST_SIZE]);
#endif /* _CRYPTO_MD5_H */
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index d4daeec8da19..190beb427c6d 100644
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -59,7 +59,7 @@ struct poly1305_desc_ctx {
};
void poly1305_init(struct poly1305_desc_ctx *desc,
- const u8 key[POLY1305_KEY_SIZE]);
+ const u8 key[at_least POLY1305_KEY_SIZE]);
void poly1305_update(struct poly1305_desc_ctx *desc,
const u8 *src, unsigned int nbytes);
void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest);
diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h
index d2e63743e592..b28b8ef11353 100644
--- a/include/crypto/polyval.h
+++ b/include/crypto/polyval.h
@@ -1,14 +1,190 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * Common values for the Polyval hash algorithm
+ * POLYVAL library API
*
- * Copyright 2021 Google LLC
+ * Copyright 2025 Google LLC
*/
#ifndef _CRYPTO_POLYVAL_H
#define _CRYPTO_POLYVAL_H
+#include <linux/string.h>
+#include <linux/types.h>
+
#define POLYVAL_BLOCK_SIZE 16
#define POLYVAL_DIGEST_SIZE 16
+/**
+ * struct polyval_elem - An element of the POLYVAL finite field
+ * @bytes: View of the element as a byte array (unioned with @lo and @hi)
+ * @lo: The low 64 terms of the element's polynomial
+ * @hi: The high 64 terms of the element's polynomial
+ *
+ * This represents an element of the finite field GF(2^128), using the POLYVAL
+ * convention: little-endian byte order and natural bit order.
+ */
+struct polyval_elem {
+ union {
+ u8 bytes[POLYVAL_BLOCK_SIZE];
+ struct {
+ __le64 lo;
+ __le64 hi;
+ };
+ };
+};
+
+/**
+ * struct polyval_key - Prepared key for POLYVAL
+ *
+ * This may contain just the raw key H, or it may contain precomputed key
+ * powers, depending on the platform's POLYVAL implementation. Use
+ * polyval_preparekey() to initialize this.
+ *
+ * By H^i we mean H^(i-1) * H * x^-128, with base case H^1 = H. I.e. the
+ * exponentiation repeats the POLYVAL dot operation, with its "extra" x^-128.
+ */
+struct polyval_key {
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+#ifdef CONFIG_ARM64
+ /** @h_powers: Powers of the hash key H^8 through H^1 */
+ struct polyval_elem h_powers[8];
+#elif defined(CONFIG_X86)
+ /** @h_powers: Powers of the hash key H^8 through H^1 */
+ struct polyval_elem h_powers[8];
+#else
+#error "Unhandled arch"
#endif
+#else /* CONFIG_CRYPTO_LIB_POLYVAL_ARCH */
+ /** @h: The hash key H */
+ struct polyval_elem h;
+#endif /* !CONFIG_CRYPTO_LIB_POLYVAL_ARCH */
+};
+
+/**
+ * struct polyval_ctx - Context for computing a POLYVAL value
+ * @key: Pointer to the prepared POLYVAL key. The user of the API is
+ * responsible for ensuring that the key lives as long as the context.
+ * @acc: The accumulator
+ * @partial: Number of data bytes processed so far modulo POLYVAL_BLOCK_SIZE
+ */
+struct polyval_ctx {
+ const struct polyval_key *key;
+ struct polyval_elem acc;
+ size_t partial;
+};
+
+/**
+ * polyval_preparekey() - Prepare a POLYVAL key
+ * @key: (output) The key structure to initialize
+ * @raw_key: The raw hash key
+ *
+ * Initialize a POLYVAL key structure from a raw key. This may be a simple
+ * copy, or it may involve precomputing powers of the key, depending on the
+ * platform's POLYVAL implementation.
+ *
+ * Context: Any context.
+ */
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+void polyval_preparekey(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE]);
+
+#else
+static inline void polyval_preparekey(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ /* Just a simple copy, so inline it. */
+ memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE);
+}
+#endif
+
+/**
+ * polyval_init() - Initialize a POLYVAL context for a new message
+ * @ctx: The context to initialize
+ * @key: The key to use. Note that a pointer to the key is saved in the
+ * context, so the key must live at least as long as the context.
+ */
+static inline void polyval_init(struct polyval_ctx *ctx,
+ const struct polyval_key *key)
+{
+ *ctx = (struct polyval_ctx){ .key = key };
+}
+
+/**
+ * polyval_import_blkaligned() - Import a POLYVAL accumulator value
+ * @ctx: The context to initialize
+ * @key: The key to import. Note that a pointer to the key is saved in the
+ * context, so the key must live at least as long as the context.
+ * @acc: The accumulator value to import.
+ *
+ * This imports an accumulator that was saved by polyval_export_blkaligned().
+ * The same key must be used.
+ */
+static inline void
+polyval_import_blkaligned(struct polyval_ctx *ctx,
+ const struct polyval_key *key,
+ const struct polyval_elem *acc)
+{
+ *ctx = (struct polyval_ctx){ .key = key, .acc = *acc };
+}
+
+/**
+ * polyval_export_blkaligned() - Export a POLYVAL accumulator value
+ * @ctx: The context to export the accumulator value from
+ * @acc: (output) The exported accumulator value
+ *
+ * This exports the accumulator from a POLYVAL context. The number of data
+ * bytes processed so far must be a multiple of POLYVAL_BLOCK_SIZE.
+ */
+static inline void polyval_export_blkaligned(const struct polyval_ctx *ctx,
+ struct polyval_elem *acc)
+{
+ *acc = ctx->acc;
+}
+
+/**
+ * polyval_update() - Update a POLYVAL context with message data
+ * @ctx: The context to update; must have been initialized
+ * @data: The message data
+ * @len: The data length in bytes. Doesn't need to be block-aligned.
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len);
+
+/**
+ * polyval_final() - Finish computing a POLYVAL value
+ * @ctx: The context to finalize
+ * @out: The output value
+ *
+ * If the total data length isn't a multiple of POLYVAL_BLOCK_SIZE, then the
+ * final block is automatically zero-padded.
+ *
+ * After finishing, this zeroizes @ctx. So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE]);
+
+/**
+ * polyval() - Compute a POLYVAL value
+ * @key: The prepared key
+ * @data: The message data
+ * @len: The data length in bytes. Doesn't need to be block-aligned.
+ * @out: The output value
+ *
+ * Context: Any context.
+ */
+static inline void polyval(const struct polyval_key *key,
+ const u8 *data, size_t len,
+ u8 out[POLYVAL_BLOCK_SIZE])
+{
+ struct polyval_ctx ctx;
+
+ polyval_init(&ctx, key);
+ polyval_update(&ctx, data, len);
+ polyval_final(&ctx, out);
+}
+
+#endif /* _CRYPTO_POLYVAL_H */
diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h
index 162a529ec841..27f08b972931 100644
--- a/include/crypto/sha1.h
+++ b/include/crypto/sha1.h
@@ -84,7 +84,7 @@ void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len);
*
* Context: Any context.
*/
-void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
+void sha1_final(struct sha1_ctx *ctx, u8 out[at_least SHA1_DIGEST_SIZE]);
/**
* sha1() - Compute SHA-1 message digest in one shot
@@ -94,7 +94,7 @@ void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
*
* Context: Any context.
*/
-void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]);
+void sha1(const u8 *data, size_t len, u8 out[at_least SHA1_DIGEST_SIZE]);
/**
* struct hmac_sha1_key - Prepared key for HMAC-SHA1
@@ -181,7 +181,8 @@ static inline void hmac_sha1_update(struct hmac_sha1_ctx *ctx,
*
* Context: Any context.
*/
-void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
+void hmac_sha1_final(struct hmac_sha1_ctx *ctx,
+ u8 out[at_least SHA1_DIGEST_SIZE]);
/**
* hmac_sha1() - Compute HMAC-SHA1 in one shot, using a prepared key
@@ -195,7 +196,8 @@ void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
* Context: Any context.
*/
void hmac_sha1(const struct hmac_sha1_key *key,
- const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]);
+ const u8 *data, size_t data_len,
+ u8 out[at_least SHA1_DIGEST_SIZE]);
/**
* hmac_sha1_usingrawkey() - Compute HMAC-SHA1 in one shot, using a raw key
@@ -212,6 +214,6 @@ void hmac_sha1(const struct hmac_sha1_key *key,
*/
void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
const u8 *data, size_t data_len,
- u8 out[SHA1_DIGEST_SIZE]);
+ u8 out[at_least SHA1_DIGEST_SIZE]);
#endif /* _CRYPTO_SHA1_H */
diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index e5dafb935cc8..7bb8fe169daf 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -190,7 +190,7 @@ static inline void sha224_update(struct sha224_ctx *ctx,
*
* Context: Any context.
*/
-void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
+void sha224_final(struct sha224_ctx *ctx, u8 out[at_least SHA224_DIGEST_SIZE]);
/**
* sha224() - Compute SHA-224 message digest in one shot
@@ -200,7 +200,7 @@ void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
*
* Context: Any context.
*/
-void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]);
+void sha224(const u8 *data, size_t len, u8 out[at_least SHA224_DIGEST_SIZE]);
/**
* struct hmac_sha224_key - Prepared key for HMAC-SHA224
@@ -287,7 +287,8 @@ static inline void hmac_sha224_update(struct hmac_sha224_ctx *ctx,
*
* Context: Any context.
*/
-void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
+void hmac_sha224_final(struct hmac_sha224_ctx *ctx,
+ u8 out[at_least SHA224_DIGEST_SIZE]);
/**
* hmac_sha224() - Compute HMAC-SHA224 in one shot, using a prepared key
@@ -301,7 +302,8 @@ void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
* Context: Any context.
*/
void hmac_sha224(const struct hmac_sha224_key *key,
- const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]);
+ const u8 *data, size_t data_len,
+ u8 out[at_least SHA224_DIGEST_SIZE]);
/**
* hmac_sha224_usingrawkey() - Compute HMAC-SHA224 in one shot, using a raw key
@@ -318,7 +320,7 @@ void hmac_sha224(const struct hmac_sha224_key *key,
*/
void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
const u8 *data, size_t data_len,
- u8 out[SHA224_DIGEST_SIZE]);
+ u8 out[at_least SHA224_DIGEST_SIZE]);
/**
* struct sha256_ctx - Context for hashing a message with SHA-256
@@ -363,7 +365,7 @@ static inline void sha256_update(struct sha256_ctx *ctx,
*
* Context: Any context.
*/
-void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
+void sha256_final(struct sha256_ctx *ctx, u8 out[at_least SHA256_DIGEST_SIZE]);
/**
* sha256() - Compute SHA-256 message digest in one shot
@@ -373,7 +375,7 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
*
* Context: Any context.
*/
-void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
+void sha256(const u8 *data, size_t len, u8 out[at_least SHA256_DIGEST_SIZE]);
/**
* sha256_finup_2x() - Compute two SHA-256 digests from a common initial
@@ -390,8 +392,9 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
* Context: Any context.
*/
void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
- const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
- u8 out2[SHA256_DIGEST_SIZE]);
+ const u8 *data2, size_t len,
+ u8 out1[at_least SHA256_DIGEST_SIZE],
+ u8 out2[at_least SHA256_DIGEST_SIZE]);
/**
* sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real
@@ -488,7 +491,8 @@ static inline void hmac_sha256_update(struct hmac_sha256_ctx *ctx,
*
* Context: Any context.
*/
-void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
+void hmac_sha256_final(struct hmac_sha256_ctx *ctx,
+ u8 out[at_least SHA256_DIGEST_SIZE]);
/**
* hmac_sha256() - Compute HMAC-SHA256 in one shot, using a prepared key
@@ -502,7 +506,8 @@ void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
* Context: Any context.
*/
void hmac_sha256(const struct hmac_sha256_key *key,
- const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]);
+ const u8 *data, size_t data_len,
+ u8 out[at_least SHA256_DIGEST_SIZE]);
/**
* hmac_sha256_usingrawkey() - Compute HMAC-SHA256 in one shot, using a raw key
@@ -519,7 +524,7 @@ void hmac_sha256(const struct hmac_sha256_key *key,
*/
void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
const u8 *data, size_t data_len,
- u8 out[SHA256_DIGEST_SIZE]);
+ u8 out[at_least SHA256_DIGEST_SIZE]);
/* State for the SHA-512 (and SHA-384) compression function */
struct sha512_block_state {
@@ -598,7 +603,7 @@ static inline void sha384_update(struct sha384_ctx *ctx,
*
* Context: Any context.
*/
-void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
+void sha384_final(struct sha384_ctx *ctx, u8 out[at_least SHA384_DIGEST_SIZE]);
/**
* sha384() - Compute SHA-384 message digest in one shot
@@ -608,7 +613,7 @@ void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
*
* Context: Any context.
*/
-void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]);
+void sha384(const u8 *data, size_t len, u8 out[at_least SHA384_DIGEST_SIZE]);
/**
* struct hmac_sha384_key - Prepared key for HMAC-SHA384
@@ -695,7 +700,8 @@ static inline void hmac_sha384_update(struct hmac_sha384_ctx *ctx,
*
* Context: Any context.
*/
-void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
+void hmac_sha384_final(struct hmac_sha384_ctx *ctx,
+ u8 out[at_least SHA384_DIGEST_SIZE]);
/**
* hmac_sha384() - Compute HMAC-SHA384 in one shot, using a prepared key
@@ -709,7 +715,8 @@ void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
* Context: Any context.
*/
void hmac_sha384(const struct hmac_sha384_key *key,
- const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]);
+ const u8 *data, size_t data_len,
+ u8 out[at_least SHA384_DIGEST_SIZE]);
/**
* hmac_sha384_usingrawkey() - Compute HMAC-SHA384 in one shot, using a raw key
@@ -726,7 +733,7 @@ void hmac_sha384(const struct hmac_sha384_key *key,
*/
void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
const u8 *data, size_t data_len,
- u8 out[SHA384_DIGEST_SIZE]);
+ u8 out[at_least SHA384_DIGEST_SIZE]);
/**
* struct sha512_ctx - Context for hashing a message with SHA-512
@@ -771,7 +778,7 @@ static inline void sha512_update(struct sha512_ctx *ctx,
*
* Context: Any context.
*/
-void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
+void sha512_final(struct sha512_ctx *ctx, u8 out[at_least SHA512_DIGEST_SIZE]);
/**
* sha512() - Compute SHA-512 message digest in one shot
@@ -781,7 +788,7 @@ void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
*
* Context: Any context.
*/
-void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]);
+void sha512(const u8 *data, size_t len, u8 out[at_least SHA512_DIGEST_SIZE]);
/**
* struct hmac_sha512_key - Prepared key for HMAC-SHA512
@@ -868,7 +875,8 @@ static inline void hmac_sha512_update(struct hmac_sha512_ctx *ctx,
*
* Context: Any context.
*/
-void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
+void hmac_sha512_final(struct hmac_sha512_ctx *ctx,
+ u8 out[at_least SHA512_DIGEST_SIZE]);
/**
* hmac_sha512() - Compute HMAC-SHA512 in one shot, using a prepared key
@@ -882,7 +890,8 @@ void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
* Context: Any context.
*/
void hmac_sha512(const struct hmac_sha512_key *key,
- const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]);
+ const u8 *data, size_t data_len,
+ u8 out[at_least SHA512_DIGEST_SIZE]);
/**
* hmac_sha512_usingrawkey() - Compute HMAC-SHA512 in one shot, using a raw key
@@ -899,6 +908,6 @@ void hmac_sha512(const struct hmac_sha512_key *key,
*/
void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
const u8 *data, size_t data_len,
- u8 out[SHA512_DIGEST_SIZE]);
+ u8 out[at_least SHA512_DIGEST_SIZE]);
#endif /* _CRYPTO_SHA2_H */
diff --git a/include/crypto/sha3.h b/include/crypto/sha3.h
index 41e1b83a6d91..c9e4182ff74f 100644
--- a/include/crypto/sha3.h
+++ b/include/crypto/sha3.h
@@ -1,11 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Common values for SHA-3 algorithms
+ *
+ * See also Documentation/crypto/sha3.rst
*/
#ifndef __CRYPTO_SHA3_H__
#define __CRYPTO_SHA3_H__
#include <linux/types.h>
+#include <linux/string.h>
#define SHA3_224_DIGEST_SIZE (224 / 8)
#define SHA3_224_BLOCK_SIZE (200 - 2 * SHA3_224_DIGEST_SIZE)
@@ -23,14 +26,321 @@
#define SHA3_512_BLOCK_SIZE (200 - 2 * SHA3_512_DIGEST_SIZE)
#define SHA3_512_EXPORT_SIZE SHA3_STATE_SIZE + SHA3_512_BLOCK_SIZE + 1
-#define SHA3_STATE_SIZE 200
+/*
+ * SHAKE128 and SHAKE256 actually have variable output size, but this is used to
+ * calculate the block size (rate) analogously to the above.
+ */
+#define SHAKE128_DEFAULT_SIZE (128 / 8)
+#define SHAKE128_BLOCK_SIZE (200 - 2 * SHAKE128_DEFAULT_SIZE)
+#define SHAKE256_DEFAULT_SIZE (256 / 8)
+#define SHAKE256_BLOCK_SIZE (200 - 2 * SHAKE256_DEFAULT_SIZE)
-struct shash_desc;
+#define SHA3_STATE_SIZE 200
+/*
+ * State for the Keccak-f[1600] permutation: 25 64-bit words.
+ *
+ * We usually keep the state words as little-endian, to make absorbing and
+ * squeezing easier. (It means that absorbing and squeezing can just treat the
+ * state as a byte array.) The state words are converted to native-endian only
+ * temporarily by implementations of the permutation that need native-endian
+ * words. Of course, that conversion is a no-op on little-endian machines.
+ */
struct sha3_state {
- u64 st[SHA3_STATE_SIZE / 8];
+ union {
+ __le64 words[SHA3_STATE_SIZE / 8];
+ u8 bytes[SHA3_STATE_SIZE];
+
+ u64 native_words[SHA3_STATE_SIZE / 8]; /* see comment above */
+ };
+};
+
+/* Internal context, shared by the digests (SHA3-*) and the XOFs (SHAKE*) */
+struct __sha3_ctx {
+ struct sha3_state state;
+ u8 digest_size; /* Digests only: the digest size in bytes */
+ u8 block_size; /* Block size in bytes */
+ u8 absorb_offset; /* Index of next state byte to absorb into */
+ u8 squeeze_offset; /* XOFs only: index of next state byte to extract */
+};
+
+void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len);
+
+/**
+ * struct sha3_ctx - Context for SHA3-224, SHA3-256, SHA3-384, or SHA3-512
+ * @ctx: private
+ */
+struct sha3_ctx {
+ struct __sha3_ctx ctx;
};
-int crypto_sha3_init(struct shash_desc *desc);
+/**
+ * sha3_zeroize_ctx() - Zeroize a SHA-3 context
+ * @ctx: The context to zeroize
+ *
+ * This is already called by sha3_final(). Call this explicitly when abandoning
+ * a context without calling sha3_final().
+ */
+static inline void sha3_zeroize_ctx(struct sha3_ctx *ctx)
+{
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+
+/**
+ * struct shake_ctx - Context for SHAKE128 or SHAKE256
+ * @ctx: private
+ */
+struct shake_ctx {
+ struct __sha3_ctx ctx;
+};
+
+/**
+ * shake_zeroize_ctx() - Zeroize a SHAKE context
+ * @ctx: The context to zeroize
+ *
+ * Call this after the last squeeze.
+ */
+static inline void shake_zeroize_ctx(struct shake_ctx *ctx)
+{
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+
+/**
+ * sha3_224_init() - Initialize a context for SHA3-224
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHA3-224 message digest computation.
+ *
+ * Context: Any context.
+ */
+static inline void sha3_224_init(struct sha3_ctx *ctx)
+{
+ *ctx = (struct sha3_ctx){
+ .ctx.digest_size = SHA3_224_DIGEST_SIZE,
+ .ctx.block_size = SHA3_224_BLOCK_SIZE,
+ };
+}
+
+/**
+ * sha3_256_init() - Initialize a context for SHA3-256
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHA3-256 message digest computation.
+ *
+ * Context: Any context.
+ */
+static inline void sha3_256_init(struct sha3_ctx *ctx)
+{
+ *ctx = (struct sha3_ctx){
+ .ctx.digest_size = SHA3_256_DIGEST_SIZE,
+ .ctx.block_size = SHA3_256_BLOCK_SIZE,
+ };
+}
+
+/**
+ * sha3_384_init() - Initialize a context for SHA3-384
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHA3-384 message digest computation.
+ *
+ * Context: Any context.
+ */
+static inline void sha3_384_init(struct sha3_ctx *ctx)
+{
+ *ctx = (struct sha3_ctx){
+ .ctx.digest_size = SHA3_384_DIGEST_SIZE,
+ .ctx.block_size = SHA3_384_BLOCK_SIZE,
+ };
+}
+
+/**
+ * sha3_512_init() - Initialize a context for SHA3-512
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHA3-512 message digest computation.
+ *
+ * Context: Any context.
+ */
+static inline void sha3_512_init(struct sha3_ctx *ctx)
+{
+ *ctx = (struct sha3_ctx){
+ .ctx.digest_size = SHA3_512_DIGEST_SIZE,
+ .ctx.block_size = SHA3_512_BLOCK_SIZE,
+ };
+}
+
+/**
+ * sha3_update() - Update a SHA-3 digest context with input data
+ * @ctx: The context to update; must have been initialized
+ * @in: The input data
+ * @in_len: Length of the input data in bytes
+ *
+ * This can be called any number of times to add data to a SHA3-224, SHA3-256,
+ * SHA3-384, or SHA3-512 digest (depending on which init function was called).
+ *
+ * Context: Any context.
+ */
+static inline void sha3_update(struct sha3_ctx *ctx,
+ const u8 *in, size_t in_len)
+{
+ __sha3_update(&ctx->ctx, in, in_len);
+}
+
+/**
+ * sha3_final() - Finish computing a SHA-3 message digest
+ * @ctx: The context to finalize; must have been initialized
+ * @out: (output) The resulting SHA3-224, SHA3-256, SHA3-384, or SHA3-512
+ * message digest, matching the init function that was called. Note that
+ * the size differs for each one; see SHA3_*_DIGEST_SIZE.
+ *
+ * After finishing, this zeroizes @ctx. So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void sha3_final(struct sha3_ctx *ctx, u8 *out);
+
+/**
+ * shake128_init() - Initialize a context for SHAKE128
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHAKE128 extendable-output function (XOF) computation.
+ *
+ * Context: Any context.
+ */
+static inline void shake128_init(struct shake_ctx *ctx)
+{
+ *ctx = (struct shake_ctx){
+ .ctx.block_size = SHAKE128_BLOCK_SIZE,
+ };
+}
+
+/**
+ * shake256_init() - Initialize a context for SHAKE256
+ * @ctx: The context to initialize
+ *
+ * This begins a new SHAKE256 extendable-output function (XOF) computation.
+ *
+ * Context: Any context.
+ */
+static inline void shake256_init(struct shake_ctx *ctx)
+{
+ *ctx = (struct shake_ctx){
+ .ctx.block_size = SHAKE256_BLOCK_SIZE,
+ };
+}
+
+/**
+ * shake_update() - Update a SHAKE context with input data
+ * @ctx: The context to update; must have been initialized
+ * @in: The input data
+ * @in_len: Length of the input data in bytes
+ *
+ * This can be called any number of times to add more input data to SHAKE128 or
+ * SHAKE256. This cannot be called after squeezing has begun.
+ *
+ * Context: Any context.
+ */
+static inline void shake_update(struct shake_ctx *ctx,
+ const u8 *in, size_t in_len)
+{
+ __sha3_update(&ctx->ctx, in, in_len);
+}
+
+/**
+ * shake_squeeze() - Generate output from SHAKE128 or SHAKE256
+ * @ctx: The context to squeeze; must have been initialized
+ * @out: Where to write the resulting output data
+ * @out_len: The amount of data to extract to @out in bytes
+ *
+ * This may be called multiple times. A number of consecutive squeezes laid
+ * end-to-end will yield the same output as one big squeeze generating the same
+ * total amount of output. More input cannot be provided after squeezing has
+ * begun. After the last squeeze, call shake_zeroize_ctx().
+ *
+ * Context: Any context.
+ */
+void shake_squeeze(struct shake_ctx *ctx, u8 *out, size_t out_len);
+
+/**
+ * sha3_224() - Compute SHA3-224 digest in one shot
+ * @in: The input data to be digested
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the digest will be stored
+ *
+ * Convenience function that computes a SHA3-224 digest. Use this instead of
+ * the incremental API if you're able to provide all the input at once.
+ *
+ * Context: Any context.
+ */
+void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]);
+
+/**
+ * sha3_256() - Compute SHA3-256 digest in one shot
+ * @in: The input data to be digested
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the digest will be stored
+ *
+ * Convenience function that computes a SHA3-256 digest. Use this instead of
+ * the incremental API if you're able to provide all the input at once.
+ *
+ * Context: Any context.
+ */
+void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]);
+
+/**
+ * sha3_384() - Compute SHA3-384 digest in one shot
+ * @in: The input data to be digested
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the digest will be stored
+ *
+ * Convenience function that computes a SHA3-384 digest. Use this instead of
+ * the incremental API if you're able to provide all the input at once.
+ *
+ * Context: Any context.
+ */
+void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]);
+
+/**
+ * sha3_512() - Compute SHA3-512 digest in one shot
+ * @in: The input data to be digested
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the digest will be stored
+ *
+ * Convenience function that computes a SHA3-512 digest. Use this instead of
+ * the incremental API if you're able to provide all the input at once.
+ *
+ * Context: Any context.
+ */
+void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]);
+
+/**
+ * shake128() - Compute SHAKE128 in one shot
+ * @in: The input data to be used
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the output will be stored
+ * @out_len: Length of the output to produce in bytes
+ *
+ * Convenience function that computes SHAKE128 in one shot. Use this instead of
+ * the incremental API if you're able to provide all the input at once as well
+ * as receive all the output at once. All output lengths are supported.
+ *
+ * Context: Any context.
+ */
+void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len);
+
+/**
+ * shake256() - Compute SHAKE256 in one shot
+ * @in: The input data to be used
+ * @in_len: Length of the input data in bytes
+ * @out: The buffer into which the output will be stored
+ * @out_len: Length of the output to produce in bytes
+ *
+ * Convenience function that computes SHAKE256 in one shot. Use this instead of
+ * the incremental API if you're able to provide all the input at once as well
+ * as receive all the output at once. All output lengths are supported.
+ *
+ * Context: Any context.
+ */
+void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len);
-#endif
+#endif /* __CRYPTO_SHA3_H__ */
diff --git a/include/drm/Makefile b/include/drm/Makefile
index 1df6962556ef..48fae3f167c7 100644
--- a/include/drm/Makefile
+++ b/include/drm/Makefile
@@ -11,7 +11,7 @@ always-$(CONFIG_DRM_HEADER_TEST) += \
quiet_cmd_hdrtest = HDRTEST $(patsubst %.hdrtest,%.h,$@)
cmd_hdrtest = \
$(CC) $(c_flags) -fsyntax-only -x c /dev/null -include $< -include $<; \
- PYTHONDONTWRITEBYTECODE=1 $(KERNELDOC) -none $(if $(CONFIG_WERROR)$(CONFIG_DRM_WERROR),-Werror) $<; \
+ PYTHONDONTWRITEBYTECODE=1 $(PYTHON3) $(KERNELDOC) -none $(if $(CONFIG_WERROR)$(CONFIG_DRM_WERROR),-Werror) $<; \
touch $@
$(obj)/%.hdrtest: $(src)/%.h FORCE
diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h
index da6301a6fcea..69d4ae92d822 100644
--- a/include/drm/intel/pciids.h
+++ b/include/drm/intel/pciids.h
@@ -877,7 +877,10 @@
MACRO__(0xB08F, ## __VA_ARGS__), \
MACRO__(0xB090, ## __VA_ARGS__), \
MACRO__(0xB0A0, ## __VA_ARGS__), \
- MACRO__(0xB0B0, ## __VA_ARGS__), \
+ MACRO__(0xB0B0, ## __VA_ARGS__)
+
+/* WCL */
+#define INTEL_WCL_IDS(MACRO__, ...) \
MACRO__(0xFD80, ## __VA_ARGS__), \
MACRO__(0xFD81, ## __VA_ARGS__)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 5ff5d99f6ead..fbf0c3a65f59 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -8,6 +8,7 @@
#ifndef _LINUX_ACPI_H
#define _LINUX_ACPI_H
+#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/ioport.h> /* for struct resource */
#include <linux/resource_ext.h>
@@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void);
void acpi_table_init_complete (void);
int acpi_table_init (void);
+static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance)
+{
+ struct acpi_table_header *table;
+ int status = acpi_get_table(signature, instance, &table);
+
+ if (ACPI_FAILURE(status))
+ return ERR_PTR(-ENOENT);
+ return table;
+}
+DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T))
+
int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
int __init_or_acpilib acpi_table_parse_entries(char *id,
unsigned long table_size, int entry_id,
@@ -755,7 +767,6 @@ int acpi_reconfig_notifier_unregister(struct notifier_block *nb);
int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count);
int acpi_gtdt_map_ppi(int type);
bool acpi_gtdt_c3stop(int type);
-int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
#endif
#ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER
@@ -1146,12 +1157,7 @@ struct acpi_s2idle_dev_ops {
#if defined(CONFIG_SUSPEND) && defined(CONFIG_X86)
int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg);
void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg);
-int acpi_get_lps0_constraint(struct acpi_device *adev);
#else /* CONFIG_SUSPEND && CONFIG_X86 */
-static inline int acpi_get_lps0_constraint(struct device *dev)
-{
- return ACPI_STATE_UNKNOWN;
-}
static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg)
{
return -ENODEV;
@@ -1349,9 +1355,6 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
void **valptr);
-struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
- struct fwnode_handle *child);
-
struct acpi_probe_entry;
typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
struct acpi_probe_entry *);
@@ -1451,13 +1454,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode,
}
static inline struct fwnode_handle *
-acpi_get_next_subnode(const struct fwnode_handle *fwnode,
- struct fwnode_handle *child)
-{
- return NULL;
-}
-
-static inline struct fwnode_handle *
acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode,
struct fwnode_handle *prev)
{
@@ -1509,12 +1505,19 @@ static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console)
#if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI)
int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res);
+const struct cpumask *acpi_irq_get_affinity(acpi_handle handle,
+ unsigned int index);
#else
static inline
int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res)
{
return -EINVAL;
}
+static inline const struct cpumask *acpi_irq_get_affinity(acpi_handle handle,
+ unsigned int index)
+{
+ return NULL;
+}
#endif
#ifdef CONFIG_ACPI_LPIT
@@ -1541,6 +1544,9 @@ int find_acpi_cpu_topology(unsigned int cpu, int level);
int find_acpi_cpu_topology_cluster(unsigned int cpu);
int find_acpi_cpu_topology_package(unsigned int cpu);
int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
+void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus);
+int find_acpi_cache_level_from_id(u32 cache_id);
+int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus);
#else
static inline int acpi_pptt_cpu_is_thread(unsigned int cpu)
{
@@ -1562,6 +1568,17 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu)
{
return -EINVAL;
}
+static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id,
+ cpumask_t *cpus) { }
+static inline int find_acpi_cache_level_from_id(u32 cache_id)
+{
+ return -ENOENT;
+}
+static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id,
+ cpumask_t *cpus)
+{
+ return -ENOENT;
+}
#endif
void acpi_arch_init(void);
diff --git a/include/linux/annotate.h b/include/linux/annotate.h
new file mode 100644
index 000000000000..7c10d34d198c
--- /dev/null
+++ b/include/linux/annotate.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ANNOTATE_H
+#define _LINUX_ANNOTATE_H
+
+#include <linux/objtool_types.h>
+
+#ifdef CONFIG_OBJTOOL
+
+#ifndef __ASSEMBLY__
+
+#define __ASM_ANNOTATE(section, label, type) \
+ ".pushsection " section ",\"M\", @progbits, 8\n\t" \
+ ".long " __stringify(label) " - .\n\t" \
+ ".long " __stringify(type) "\n\t" \
+ ".popsection\n\t"
+
+#define ASM_ANNOTATE_LABEL(label, type) \
+ __ASM_ANNOTATE(".discard.annotate_insn", label, type)
+
+#define ASM_ANNOTATE(type) \
+ "911:\n\t" \
+ ASM_ANNOTATE_LABEL(911b, type)
+
+#define ASM_ANNOTATE_DATA(type) \
+ "912:\n\t" \
+ __ASM_ANNOTATE(".discard.annotate_data", 912b, type)
+
+#else /* __ASSEMBLY__ */
+
+.macro __ANNOTATE section, type
+.Lhere_\@:
+ .pushsection \section, "M", @progbits, 8
+ .long .Lhere_\@ - .
+ .long \type
+ .popsection
+.endm
+
+.macro ANNOTATE type
+ __ANNOTATE ".discard.annotate_insn", \type
+.endm
+
+.macro ANNOTATE_DATA type
+ __ANNOTATE ".discard.annotate_data", \type
+.endm
+
+#endif /* __ASSEMBLY__ */
+
+#else /* !CONFIG_OBJTOOL */
+#ifndef __ASSEMBLY__
+#define ASM_ANNOTATE_LABEL(label, type) ""
+#define ASM_ANNOTATE(type)
+#define ASM_ANNOTATE_DATA(type)
+#else /* __ASSEMBLY__ */
+.macro ANNOTATE type
+.endm
+.macro ANNOTATE_DATA type
+.endm
+#endif /* __ASSEMBLY__ */
+#endif /* !CONFIG_OBJTOOL */
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Annotate away the various 'relocation to !ENDBR` complaints; knowing that
+ * these relocations will never be used for indirect calls.
+ */
+#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR)
+#define ANNOTATE_NOENDBR_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOENDBR))
+
+/*
+ * This should be used immediately before an indirect jump/call. It tells
+ * objtool the subsequent indirect jump/call is vouched safe for retpoline
+ * builds.
+ */
+#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE)
+/*
+ * See linux/instrumentation.h
+ */
+#define ANNOTATE_INSTR_BEGIN(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_BEGIN)
+#define ANNOTATE_INSTR_END(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_END)
+/*
+ * objtool annotation to ignore the alternatives and only consider the original
+ * instruction(s).
+ */
+#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS)
+/*
+ * This macro indicates that the following intra-function call is valid.
+ * Any non-annotated intra-function call will cause objtool to issue a warning.
+ */
+#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL)
+/*
+ * Use objtool to validate the entry requirement that all code paths do
+ * VALIDATE_UNRET_END before RET.
+ *
+ * NOTE: The macro must be used at the beginning of a global symbol, otherwise
+ * it will be ignored.
+ */
+#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN)
+/*
+ * This should be used to refer to an instruction that is considered
+ * terminating, like a noreturn CALL or UD2 when we know they are not -- eg
+ * WARN using UD2.
+ */
+#define ANNOTATE_REACHABLE(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_REACHABLE)
+/*
+ * This should not be used; it annotates away CFI violations. There are a few
+ * valid use cases like kexec handover to the next kernel image, and there is
+ * no security concern there.
+ *
+ * There are also a few real issues annotated away, like EFI because we can't
+ * control the EFI code.
+ */
+#define ANNOTATE_NOCFI_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOCFI))
+
+/*
+ * Annotate a special section entry. This emables livepatch module generation
+ * to find and extract individual special section entries as needed.
+ */
+#define ANNOTATE_DATA_SPECIAL ASM_ANNOTATE_DATA(ANNOTYPE_DATA_SPECIAL)
+
+#else /* __ASSEMBLY__ */
+#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR
+#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE
+/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */
+/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */
+#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS
+#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL
+#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN
+#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE
+#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI
+#define ANNOTATE_DATA_SPECIAL ANNOTATE_DATA type=ANNOTYPE_DATA_SPECIAL
+#endif /* __ASSEMBLY__ */
+
+#endif /* _LINUX_ANNOTATE_H */
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index d72d6e5aa200..0c2a8b846c20 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -89,6 +89,21 @@ void remove_cpu_topology(unsigned int cpuid);
void reset_cpu_topology(void);
int parse_acpi_topology(void);
void freq_inv_set_max_ratio(int cpu, u64 max_rate);
-#endif
+
+/*
+ * Architectures like ARM64 don't have reliable architectural way to get SMT
+ * information and depend on the firmware (ACPI/OF) report. Non-SMT core won't
+ * initialize thread_id so we can use this to detect the SMT implementation.
+ */
+static inline bool topology_core_has_smt(int cpu)
+{
+ return cpu_topology[cpu].thread_id != -1;
+}
+
+#else
+
+static inline bool topology_core_has_smt(int cpu) { return false; }
+
+#endif /* CONFIG_GENERIC_ARCH_TOPOLOGY */
#endif /* _LINUX_ARCH_TOPOLOGY_H_ */
diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h
new file mode 100644
index 000000000000..7f00c5285a32
--- /dev/null
+++ b/include/linux/arm_mpam.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2025 Arm Ltd. */
+
+#ifndef __LINUX_ARM_MPAM_H
+#define __LINUX_ARM_MPAM_H
+
+#include <linux/acpi.h>
+#include <linux/types.h>
+
+struct mpam_msc;
+
+enum mpam_msc_iface {
+ MPAM_IFACE_MMIO, /* a real MPAM MSC */
+ MPAM_IFACE_PCC, /* a fake MPAM MSC */
+};
+
+enum mpam_class_types {
+ MPAM_CLASS_CACHE, /* Caches, e.g. L2, L3 */
+ MPAM_CLASS_MEMORY, /* Main memory */
+ MPAM_CLASS_UNKNOWN, /* Everything else, e.g. SMMU */
+};
+
+#define MPAM_CLASS_ID_DEFAULT 255
+
+#ifdef CONFIG_ACPI_MPAM
+int acpi_mpam_parse_resources(struct mpam_msc *msc,
+ struct acpi_mpam_msc_node *tbl_msc);
+
+int acpi_mpam_count_msc(void);
+#else
+static inline int acpi_mpam_parse_resources(struct mpam_msc *msc,
+ struct acpi_mpam_msc_node *tbl_msc)
+{
+ return -EINVAL;
+}
+
+static inline int acpi_mpam_count_msc(void) { return -EINVAL; }
+#endif
+
+#ifdef CONFIG_ARM64_MPAM_DRIVER
+int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
+ enum mpam_class_types type, u8 class_id, int component_id);
+#else
+static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
+ enum mpam_class_types type, u8 class_id,
+ int component_id)
+{
+ return -EINVAL;
+}
+#endif
+
+/**
+ * mpam_register_requestor() - Register a requestor with the MPAM driver
+ * @partid_max: The maximum PARTID value the requestor can generate.
+ * @pmg_max: The maximum PMG value the requestor can generate.
+ *
+ * Registers a requestor with the MPAM driver to ensure the chosen system-wide
+ * minimum PARTID and PMG values will allow the requestors features to be used.
+ *
+ * Returns an error if the registration is too late, and a larger PARTID/PMG
+ * value has been advertised to user-space. In this case the requestor should
+ * not use its MPAM features. Returns 0 on success.
+ */
+int mpam_register_requestor(u16 partid_max, u8 pmg_max);
+
+#endif /* __LINUX_ARM_MPAM_H */
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 792e10a09787..c9013e472aa3 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -566,6 +566,7 @@ struct ata_bmdma_prd {
#define ata_id_has_ncq(id) ((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8))
#define ata_id_queue_depth(id) (((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1)
#define ata_id_removable(id) ((id)[ATA_ID_CONFIG] & (1 << 7))
+#define ata_id_is_locked(id) (((id)[ATA_ID_DLF] & 0x7) == 0x7)
#define ata_id_has_atapi_AN(id) \
((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \
((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 9409a6ddf3e0..37ab6314a9f7 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -1276,7 +1276,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
kcsan_mb();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_try_cmpxchg(v, old, new);
}
@@ -1298,7 +1298,7 @@ static __always_inline bool
atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_try_cmpxchg_acquire(v, old, new);
}
@@ -1321,7 +1321,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
kcsan_release();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_try_cmpxchg_release(v, old, new);
}
@@ -1343,7 +1343,7 @@ static __always_inline bool
atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_try_cmpxchg_relaxed(v, old, new);
}
@@ -2854,7 +2854,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
kcsan_mb();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic64_try_cmpxchg(v, old, new);
}
@@ -2876,7 +2876,7 @@ static __always_inline bool
atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic64_try_cmpxchg_acquire(v, old, new);
}
@@ -2899,7 +2899,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
kcsan_release();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic64_try_cmpxchg_release(v, old, new);
}
@@ -2921,7 +2921,7 @@ static __always_inline bool
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic64_try_cmpxchg_relaxed(v, old, new);
}
@@ -4432,7 +4432,7 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
kcsan_mb();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_long_try_cmpxchg(v, old, new);
}
@@ -4454,7 +4454,7 @@ static __always_inline bool
atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_long_try_cmpxchg_acquire(v, old, new);
}
@@ -4477,7 +4477,7 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
kcsan_release();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_long_try_cmpxchg_release(v, old, new);
}
@@ -4499,7 +4499,7 @@ static __always_inline bool
atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_long_try_cmpxchg_relaxed(v, old, new);
}
@@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 8829b337928e9508259079d32581775ececd415b
+// f618ac667f868941a84ce0ab2242f1786e049ed4
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c5c9d89c73ed..610ef62b6a32 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -63,6 +63,8 @@ enum wb_reason {
struct wb_completion {
atomic_t cnt;
wait_queue_head_t *waitq;
+ unsigned long progress_stamp; /* The jiffies when slow progress is detected */
+ unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */
};
#define __WB_COMPLETION_INIT(_waitq) \
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 3e64f14739dd..0c8342747cab 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -277,10 +277,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
rcu_read_lock();
/*
- * Paired with store_release in inode_switch_wbs_work_fn() and
+ * Paired with a release fence in inode_do_switch_wbs() and
* ensures that we see the new wb if we see cleared I_WB_SWITCH.
*/
- cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+ cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH;
+ smp_rmb();
if (unlikely(cookie->locked))
xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 595217b7a6e7..b0395e4ccf90 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -45,6 +45,7 @@ struct device;
* bitmap_copy(dst, src, nbits) *dst = *src
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
+ * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
@@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
+unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
@@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const unsigned long *src1,
}
static __always_inline
+unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1,
+ const unsigned long *src2, unsigned int nbits)
+{
+ if (small_const_nbits(nbits)) {
+ *dst = *src1 | *src2;
+ return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
+ } else {
+ return __bitmap_weighted_or(dst, src1, src2, nbits);
+ }
+}
+
+static __always_inline
void bitmap_xor(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8e8d1cc8b06c..44c30183ecc3 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -341,15 +341,15 @@ enum req_op {
/* write the zero filled sector many times */
REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9,
/* Open a zone */
- REQ_OP_ZONE_OPEN = (__force blk_opf_t)10,
+ REQ_OP_ZONE_OPEN = (__force blk_opf_t)11,
/* Close a zone */
- REQ_OP_ZONE_CLOSE = (__force blk_opf_t)11,
+ REQ_OP_ZONE_CLOSE = (__force blk_opf_t)13,
/* Transition a zone to full */
- REQ_OP_ZONE_FINISH = (__force blk_opf_t)13,
+ REQ_OP_ZONE_FINISH = (__force blk_opf_t)15,
/* reset a zone write pointer */
- REQ_OP_ZONE_RESET = (__force blk_opf_t)15,
+ REQ_OP_ZONE_RESET = (__force blk_opf_t)17,
/* reset all the zone present on the device */
- REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)17,
+ REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)19,
/* Driver private requests */
REQ_OP_DRV_IN = (__force blk_opf_t)34,
@@ -478,6 +478,7 @@ static inline bool op_is_zone_mgmt(enum req_op op)
{
switch (op & REQ_OP_MASK) {
case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
case REQ_OP_ZONE_FINISH:
diff --git a/include/linux/bug.h b/include/linux/bug.h
index a9948a9f1093..17a4933c611b 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -42,6 +42,7 @@ void bug_get_file_line(struct bug_entry *bug, const char **file,
struct bug_entry *find_bug(unsigned long bugaddr);
enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
+enum bug_trap_type report_bug_entry(struct bug_entry *bug, struct pt_regs *regs);
/* These are defined by the architecture */
int is_valid_bugaddr(unsigned long addr);
@@ -62,6 +63,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
}
struct bug_entry;
+
+static inline enum bug_trap_type
+report_bug_entry(struct bug_entry *bug, struct pt_regs *regs)
+{
+ return BUG_TRAP_TYPE_BUG;
+}
+
static inline void bug_get_file_line(struct bug_entry *bug, const char **file,
unsigned int *line)
{
diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index b3705e8bbe2b..55a44199de87 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words)
}
}
+static inline void le64_to_cpu_array(u64 *buf, unsigned int words)
+{
+ while (words--) {
+ __le64_to_cpus(buf);
+ buf++;
+ }
+}
+
+static inline void cpu_to_le64_array(u64 *buf, unsigned int words)
+{
+ while (words--) {
+ __cpu_to_le64s(buf);
+ buf++;
+ }
+}
+
static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words)
{
size_t i;
diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h
index 7fcec025c5e0..559353ad64ac 100644
--- a/include/linux/cc_platform.h
+++ b/include/linux/cc_platform.h
@@ -74,7 +74,7 @@ enum cc_attr {
CC_ATTR_GUEST_UNROLL_STRING_IO,
/**
- * @CC_ATTR_SEV_SNP: Guest SNP is active.
+ * @CC_ATTR_GUEST_SEV_SNP: Guest SNP is active.
*
* The platform/OS is running as a guest/virtual machine and actively
* using AMD SEV-SNP features.
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 733e7f93db66..63e0e2aa1ce9 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -306,8 +306,7 @@ struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
u64 ceph_client_gid(struct ceph_client *client);
extern void ceph_destroy_client(struct ceph_client *client);
extern void ceph_reset_client_addr(struct ceph_client *client);
-extern int __ceph_open_session(struct ceph_client *client,
- unsigned long started);
+extern int __ceph_open_session(struct ceph_client *client);
extern int ceph_open_session(struct ceph_client *client);
int ceph_wait_for_latest_osdmap(struct ceph_client *client,
unsigned long timeout);
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585b7f06..0b55a8f6c59e 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -208,7 +208,7 @@
*/
#define DEFINE_FREE(_name, _type, _free) \
- static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; }
+ static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; }
#define __free(_name) __cleanup(__free_##_name)
@@ -220,7 +220,7 @@
__val; \
})
-static inline __must_check
+static __always_inline __must_check
const volatile void * __must_check_fn(const volatile void *val)
{ return val; }
@@ -261,6 +261,10 @@ const volatile void * __must_check_fn(const volatile void *val)
* CLASS(name, var)(args...):
* declare the variable @var as an instance of the named class
*
+ * CLASS_INIT(name, var, init_expr):
+ * declare the variable @var as an instance of the named class with
+ * custom initialization expression.
+ *
* Ex.
*
* DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd)
@@ -274,31 +278,35 @@ const volatile void * __must_check_fn(const volatile void *val)
#define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \
typedef _type class_##_name##_t; \
-static inline void class_##_name##_destructor(_type *p) \
+static __always_inline void class_##_name##_destructor(_type *p) \
{ _type _T = *p; _exit; } \
-static inline _type class_##_name##_constructor(_init_args) \
+static __always_inline _type class_##_name##_constructor(_init_args) \
{ _type t = _init; return t; }
#define EXTEND_CLASS(_name, ext, _init, _init_args...) \
typedef class_##_name##_t class_##_name##ext##_t; \
-static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\
+static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \
{ class_##_name##_destructor(p); } \
-static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
+static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
{ class_##_name##_t t = _init; return t; }
#define CLASS(_name, var) \
class_##_name##_t var __cleanup(class_##_name##_destructor) = \
class_##_name##_constructor
-#define scoped_class(_name, var, args) \
- for (CLASS(_name, var)(args); \
- __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \
- ({ goto _label; })) \
- if (0) { \
-_label: \
- break; \
+#define CLASS_INIT(_name, _var, _init_expr) \
+ class_##_name##_t _var __cleanup(class_##_name##_destructor) = (_init_expr)
+
+#define __scoped_class(_name, var, _label, args...) \
+ for (CLASS(_name, var)(args); ; ({ goto _label; })) \
+ if (0) { \
+_label: \
+ break; \
} else
+#define scoped_class(_name, var, args...) \
+ __scoped_class(_name, var, __UNIQUE_ID(label), args)
+
/*
* DEFINE_GUARD(name, type, lock, unlock):
* trivial wrapper around DEFINE_CLASS() above specifically
@@ -340,6 +348,11 @@ _label: \
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \
+ __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
+ static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+ { return (void *)1; }
+
#define __GUARD_IS_ERR(_ptr) \
({ \
unsigned long _rc = (__force unsigned long)(_ptr); \
@@ -347,7 +360,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
})
#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \
- static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
+ static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
{ \
void *_ptr = (void *)(__force unsigned long)*(_exp); \
if (IS_ERR(_ptr)) { \
@@ -355,7 +368,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
} \
return _ptr; \
} \
- static inline int class_##_name##_lock_err(class_##_name##_t *_T) \
+ static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \
{ \
long _rc = (__force unsigned long)*(_exp); \
if (!_rc) { \
@@ -384,9 +397,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
EXTEND_CLASS(_name, _ext, \
({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \
class_##_name##_t _T) \
- static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+ static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
{ return class_##_name##_lock_ptr(_T); } \
- static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
+ static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
{ return class_##_name##_lock_err(_T); }
/*
@@ -466,7 +479,7 @@ typedef struct { \
__VA_ARGS__; \
} class_##_name##_t; \
\
-static inline void class_##_name##_destructor(class_##_name##_t *_T) \
+static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \
{ \
if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \
} \
@@ -474,7 +487,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \
__DEFINE_GUARD_LOCK_PTR(_name, &_T->lock)
#define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \
-static inline class_##_name##_t class_##_name##_constructor(_type *l) \
+static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \
{ \
class_##_name##_t _t = { .lock = l }, *_T = &_t; \
_lock; \
@@ -482,7 +495,7 @@ static inline class_##_name##_t class_##_name##_constructor(_type *l) \
}
#define __DEFINE_LOCK_GUARD_0(_name, _lock) \
-static inline class_##_name##_t class_##_name##_constructor(void) \
+static __always_inline class_##_name##_t class_##_name##_constructor(void) \
{ \
class_##_name##_t _t = { .lock = (void*)1 }, \
*_T __maybe_unused = &_t; \
@@ -508,9 +521,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock)
if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\
_t; }), \
typeof_member(class_##_name##_t, lock) l) \
- static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+ static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
{ return class_##_name##_lock_ptr(_T); } \
- static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
+ static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
{ return class_##_name##_lock_err(_T); }
#define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 5b45ea7dff3e..ab181d87d71d 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -163,7 +163,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
__asm__ ("" : "=r" (var) : "0" (var))
#endif
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
+/* Format: __UNIQUE_ID_<name>_<__COUNTER__> */
+#define __UNIQUE_ID(name) \
+ __PASTE(__UNIQUE_ID_, \
+ __PASTE(name, \
+ __PASTE(_, __COUNTER__)))
/**
* data_race - mark an expression as containing intentional data races
@@ -283,7 +287,7 @@ static inline void *offset_to_ptr(const int *off)
*/
#define ___ADDRESSABLE(sym, __attrs) \
static void * __used __attrs \
- __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym;
+ __UNIQUE_ID(__PASTE(addressable_, sym)) = (void *)(uintptr_t)&sym;
#define __ADDRESSABLE(sym) \
___ADDRESSABLE(sym, __section(".discard.addressable"))
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 59288a2c1ad2..3eac51d68426 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -250,10 +250,9 @@ struct ftrace_likely_data {
/*
* GCC does not warn about unused static inline functions for -Wunused-function.
* Suppress the warning in clang as well by using __maybe_unused, but enable it
- * for W=1 build. This will allow clang to find unused functions. Remove the
- * __inline_maybe_unused entirely after fixing most of -Wunused-function warnings.
+ * for W=2 build. This will allow clang to find unused functions.
*/
-#ifdef KBUILD_EXTRA_WARN1
+#ifdef KBUILD_EXTRA_WARN2
#define __inline_maybe_unused
#else
#define __inline_maybe_unused __maybe_unused
@@ -394,6 +393,21 @@ struct ftrace_likely_data {
#define __counted_by_be(member) __counted_by(member)
#endif
+/*
+ * This designates the minimum number of elements a passed array parameter must
+ * have. For example:
+ *
+ * void some_function(u8 param[at_least 7]);
+ *
+ * If a caller passes an array with fewer than 7 elements, the compiler will
+ * emit a warning.
+ */
+#ifndef __CHECKER__
+#define at_least static
+#else
+#define at_least
+#endif
+
/* Do not trap wrapping arithmetic within an annotated function. */
#ifdef CONFIG_UBSAN_INTEGER_WRAP
# define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow")))
@@ -461,6 +475,12 @@ struct ftrace_likely_data {
# define __nocfi
#endif
+#if defined(CONFIG_ARCH_USES_CFI_GENERIC_LLVM_PASS)
+# define __nocfi_generic __nocfi
+#else
+# define __nocfi_generic
+#endif
+
/*
* Any place that could be marked with the "alloc_size" attribute is also
* a place to be marked with the "malloc" attribute, except those that may
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index a9ee4fe55dcf..4073690504a7 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -248,7 +248,8 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev,
u64 latency_limit_ns);
extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+ struct cpuidle_device *dev,
+ u64 latency_limit_ns);
extern void cpuidle_use_deepest_state(u64 latency_limit_ns);
#else
static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
@@ -256,7 +257,8 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
u64 latency_limit_ns)
{return -ENODEV; }
static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev,
+ u64 latency_limit_ns)
{return -ENODEV; }
static inline void cpuidle_use_deepest_state(u64 latency_limit_ns)
{
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ff8f41ab7ce6..afedfd5bea07 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -126,6 +126,7 @@ extern struct cpumask __cpu_dying_mask;
#define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask)
extern atomic_t __num_online_cpus;
+extern unsigned int __num_possible_cpus;
extern cpumask_t cpus_booted_once_mask;
@@ -729,6 +730,22 @@ void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
}
/**
+ * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Return: The number of bits set in the resulting cpumask @dstp
+ */
+static __always_inline
+unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p,
+ const struct cpumask *src2p)
+{
+ return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p),
+ cpumask_bits(src2p), small_cpumask_bits);
+}
+
+/**
* cpumask_xor - *dstp = *src1p ^ *src2p
* @dstp: the cpumask result
* @src1p: the first input
@@ -1005,6 +1022,7 @@ static __always_inline unsigned int cpumask_size(void)
#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x)
#define __cpumask_var_read_mostly __read_mostly
+#define CPUMASK_VAR_NULL NULL
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
@@ -1051,6 +1069,7 @@ static __always_inline bool cpumask_available(cpumask_var_t mask)
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
#define __cpumask_var_read_mostly
+#define CPUMASK_VAR_NULL {}
static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
@@ -1136,13 +1155,13 @@ void init_cpu_possible(const struct cpumask *src);
#define __assign_cpu(cpu, mask, val) \
__assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val))
-#define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible))
#define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled))
#define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present))
#define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active))
#define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying))
void set_cpu_online(unsigned int cpu, bool online);
+void set_cpu_possible(unsigned int cpu, bool possible);
/**
* to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
@@ -1195,7 +1214,12 @@ static __always_inline unsigned int num_online_cpus(void)
{
return raw_atomic_read(&__num_online_cpus);
}
-#define num_possible_cpus() cpumask_weight(cpu_possible_mask)
+
+static __always_inline unsigned int num_possible_cpus(void)
+{
+ return __num_possible_cpus;
+}
+
#define num_enabled_cpus() cpumask_weight(cpu_enabled_mask)
#define num_present_cpus() cpumask_weight(cpu_present_mask)
#define num_active_cpus() cpumask_weight(cpu_active_mask)
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 89ae50ad2ace..343a140a6ba2 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -20,6 +20,8 @@
struct cred;
struct inode;
+extern struct task_struct init_task;
+
/*
* COW Supplementary groups list
*/
@@ -156,6 +158,11 @@ extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
+static inline const struct cred *kernel_cred(void)
+{
+ /* shut up sparse */
+ return rcu_dereference_raw(init_task.cred);
+}
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
@@ -180,6 +187,16 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred)
return rcu_replace_pointer(current->cred, revert_cred, 1);
}
+DEFINE_CLASS(override_creds,
+ const struct cred *,
+ revert_creds(_T),
+ override_creds(override_cred), const struct cred *override_cred)
+
+#define scoped_with_creds(cred) \
+ scoped_class(override_creds, __UNIQUE_ID(label), cred)
+
+#define scoped_with_kernel_creds() scoped_with_creds(kernel_cred())
+
/**
* get_cred_many - Get references on a set of credentials
* @cred: The credentials to reference
@@ -263,6 +280,11 @@ static inline void put_cred(const struct cred *cred)
put_cred_many(cred, 1);
}
+DEFINE_CLASS(prepare_creds,
+ struct cred *,
+ if (_T) put_cred(_T),
+ prepare_creds(), void)
+
DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T))
/**
diff --git a/include/linux/delay.h b/include/linux/delay.h
index 89866bab100d..46412c00033a 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -68,7 +68,7 @@ void usleep_range_state(unsigned long min, unsigned long max,
* @min: Minimum time in microseconds to sleep
* @max: Maximum time in microseconds to sleep
*
- * For basic information please refere to usleep_range_state().
+ * For basic information please refer to usleep_range_state().
*
* The task will be in the state TASK_UNINTERRUPTIBLE during the sleep.
*/
@@ -82,10 +82,10 @@ static inline void usleep_range(unsigned long min, unsigned long max)
* @min: Minimum time in microseconds to sleep
* @max: Maximum time in microseconds to sleep
*
- * For basic information please refere to usleep_range_state().
+ * For basic information please refer to usleep_range_state().
*
* The sleeping task has the state TASK_IDLE during the sleep to prevent
- * contribution to the load avarage.
+ * contribution to the load average.
*/
static inline void usleep_range_idle(unsigned long min, unsigned long max)
{
@@ -96,7 +96,7 @@ static inline void usleep_range_idle(unsigned long min, unsigned long max)
* ssleep - wrapper for seconds around msleep
* @seconds: Requested sleep duration in seconds
*
- * Please refere to msleep() for detailed information.
+ * Please refer to msleep() for detailed information.
*/
static inline void ssleep(unsigned int seconds)
{
diff --git a/include/linux/devfreq-governor.h b/include/linux/devfreq-governor.h
new file mode 100644
index 000000000000..dfdd0160a29f
--- /dev/null
+++ b/include/linux/devfreq-governor.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * governor.h - internal header for devfreq governors.
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This header is for devfreq governors
+ */
+
+#ifndef __LINUX_DEVFREQ_DEVFREQ_H__
+#define __LINUX_DEVFREQ_DEVFREQ_H__
+
+#include <linux/devfreq.h>
+
+#define DEVFREQ_NAME_LEN 16
+
+#define to_devfreq(DEV) container_of((DEV), struct devfreq, dev)
+
+/* Devfreq events */
+#define DEVFREQ_GOV_START 0x1
+#define DEVFREQ_GOV_STOP 0x2
+#define DEVFREQ_GOV_UPDATE_INTERVAL 0x3
+#define DEVFREQ_GOV_SUSPEND 0x4
+#define DEVFREQ_GOV_RESUME 0x5
+
+#define DEVFREQ_MIN_FREQ 0
+#define DEVFREQ_MAX_FREQ ULONG_MAX
+
+/*
+ * Definition of the governor feature flags
+ * - DEVFREQ_GOV_FLAG_IMMUTABLE
+ * : This governor is never changeable to other governors.
+ * - DEVFREQ_GOV_FLAG_IRQ_DRIVEN
+ * : The devfreq won't schedule the work for this governor.
+ */
+#define DEVFREQ_GOV_FLAG_IMMUTABLE BIT(0)
+#define DEVFREQ_GOV_FLAG_IRQ_DRIVEN BIT(1)
+
+/*
+ * Definition of governor attribute flags except for common sysfs attributes
+ * - DEVFREQ_GOV_ATTR_POLLING_INTERVAL
+ * : Indicate polling_interval sysfs attribute
+ * - DEVFREQ_GOV_ATTR_TIMER
+ * : Indicate timer sysfs attribute
+ */
+#define DEVFREQ_GOV_ATTR_POLLING_INTERVAL BIT(0)
+#define DEVFREQ_GOV_ATTR_TIMER BIT(1)
+
+/**
+ * struct devfreq_governor - Devfreq policy governor
+ * @node: list node - contains registered devfreq governors
+ * @name: Governor's name
+ * @attrs: Governor's sysfs attribute flags
+ * @flags: Governor's feature flags
+ * @get_target_freq: Returns desired operating frequency for the device.
+ * Basically, get_target_freq will run
+ * devfreq_dev_profile.get_dev_status() to get the
+ * status of the device (load = busy_time / total_time).
+ * @event_handler: Callback for devfreq core framework to notify events
+ * to governors. Events include per device governor
+ * init and exit, opp changes out of devfreq, suspend
+ * and resume of per device devfreq during device idle.
+ *
+ * Note that the callbacks are called with devfreq->lock locked by devfreq.
+ */
+struct devfreq_governor {
+ struct list_head node;
+
+ const char name[DEVFREQ_NAME_LEN];
+ const u64 attrs;
+ const u64 flags;
+ int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
+ int (*event_handler)(struct devfreq *devfreq,
+ unsigned int event, void *data);
+};
+
+void devfreq_monitor_start(struct devfreq *devfreq);
+void devfreq_monitor_stop(struct devfreq *devfreq);
+void devfreq_monitor_suspend(struct devfreq *devfreq);
+void devfreq_monitor_resume(struct devfreq *devfreq);
+void devfreq_update_interval(struct devfreq *devfreq, unsigned int *delay);
+
+int devfreq_add_governor(struct devfreq_governor *governor);
+int devfreq_remove_governor(struct devfreq_governor *governor);
+
+int devm_devfreq_add_governor(struct device *dev,
+ struct devfreq_governor *governor);
+
+int devfreq_update_status(struct devfreq *devfreq, unsigned long freq);
+int devfreq_update_target(struct devfreq *devfreq, unsigned long freq);
+void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq,
+ unsigned long *max_freq);
+
+static inline int devfreq_update_stats(struct devfreq *df)
+{
+ if (!df->profile->get_dev_status)
+ return -EINVAL;
+
+ return df->profile->get_dev_status(df->dev.parent, &df->last_status);
+}
+#endif /* __LINUX_DEVFREQ_DEVFREQ_H__ */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 8248ff9363ee..2ceda49c609f 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -90,7 +90,7 @@
*/
#define DMA_MAPPING_ERROR (~(dma_addr_t)0)
-#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
+#define DMA_BIT_MASK(n) GENMASK_ULL(n - 1, 0)
struct dma_iova_state {
dma_addr_t addr;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index a98cc39e7aaa..b23ff8b83219 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1126,6 +1126,8 @@ static inline bool efi_runtime_disabled(void) { return true; }
extern void efi_call_virt_check_flags(unsigned long flags, const void *caller);
extern unsigned long efi_call_virt_save_flags(void);
+void efi_runtime_assert_lock_held(void);
+
enum efi_secureboot_mode {
efi_secureboot_mode_unset,
efi_secureboot_mode_unknown,
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 69b136e4dd2b..bb3dcded055f 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -60,23 +60,21 @@
#else /* !__ASSEMBLER__ */
#include <uapi/linux/elf.h>
+#include <linux/compiler.h>
/*
* Use an anonymous structure which matches the shape of
* Elf{32,64}_Nhdr, but includes the name and desc data. The size and
* type of name and desc depend on the macro arguments. "name" must
- * be a literal string, and "desc" must be passed by value. You may
- * only define one note per line, since __LINE__ is used to generate
- * unique symbols.
+ * be a literal string, and "desc" must be passed by value.
*/
-#define _ELFNOTE_PASTE(a,b) a##b
-#define _ELFNOTE(size, name, unique, type, desc) \
+#define ELFNOTE(size, name, type, desc) \
static const struct { \
struct elf##size##_note _nhdr; \
unsigned char _name[sizeof(name)] \
__attribute__((aligned(sizeof(Elf##size##_Word)))); \
typeof(desc) _desc \
__attribute__((aligned(sizeof(Elf##size##_Word)))); \
- } _ELFNOTE_PASTE(_note_, unique) \
+ } __UNIQUE_ID(note) \
__used \
__attribute__((section(".note." name), \
aligned(sizeof(Elf##size##_Word)), \
@@ -89,11 +87,10 @@
name, \
desc \
}
-#define ELFNOTE(size, name, type, desc) \
- _ELFNOTE(size, name, __LINE__, type, desc)
#define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc)
#define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc)
+
#endif /* __ASSEMBLER__ */
#endif /* _LINUX_ELFNOTE_H */
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 61d50571ad88..43aa6153dc57 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -54,6 +54,8 @@ struct em_perf_table {
/**
* struct em_perf_domain - Performance domain
* @em_table: Pointer to the runtime modifiable em_perf_table
+ * @node: node in em_pd_list (in energy_model.c)
+ * @id: A unique ID number for each performance domain
* @nr_perf_states: Number of performance states
* @min_perf_state: Minimum allowed Performance State index
* @max_perf_state: Maximum allowed Performance State index
@@ -71,6 +73,8 @@ struct em_perf_table {
*/
struct em_perf_domain {
struct em_perf_table __rcu *em_table;
+ struct list_head node;
+ int id;
int nr_perf_states;
int min_perf_state;
int max_perf_state;
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 7177436f0f9e..87efb38b7081 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -3,11 +3,11 @@
#define __LINUX_ENTRYCOMMON_H
#include <linux/irq-entry-common.h>
+#include <linux/livepatch.h>
#include <linux/ptrace.h>
+#include <linux/resume_user_mode.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
-#include <linux/livepatch.h>
-#include <linux/resume_user_mode.h>
#include <asm/entry-common.h>
#include <asm/syscall.h>
@@ -37,6 +37,7 @@
SYSCALL_WORK_SYSCALL_AUDIT | \
SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
ARCH_SYSCALL_WORK_ENTER)
+
#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
SYSCALL_WORK_SYSCALL_TRACE | \
SYSCALL_WORK_SYSCALL_AUDIT | \
@@ -44,25 +45,7 @@
SYSCALL_WORK_SYSCALL_EXIT_TRAP | \
ARCH_SYSCALL_WORK_EXIT)
-/**
- * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
- * @regs: Pointer to currents pt_regs
- *
- * Invoked from architecture specific syscall entry code with interrupts
- * disabled. The calling code has to be non-instrumentable. When the
- * function returns all state is correct, interrupts are enabled and the
- * subsequent functions can be instrumented.
- *
- * This handles lockdep, RCU (context tracking) and tracing state, i.e.
- * the functionality provided by enter_from_user_mode().
- *
- * This is invoked when there is extra architecture specific functionality
- * to be done between establishing state and handling user mode entry work.
- */
-void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
-
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work);
+long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
/**
* syscall_enter_from_user_mode_work - Check and handle work before invoking
@@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
* @syscall: The syscall number
*
* Invoked from architecture specific syscall entry code with interrupts
- * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
- * architecture specific work.
+ * enabled after invoking enter_from_user_mode(), enabling interrupts and
+ * extra architecture specific work.
*
* Returns: The original or a modified syscall number
*
@@ -108,8 +91,9 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re
* function returns all state is correct, interrupts are enabled and the
* subsequent functions can be instrumented.
*
- * This is combination of syscall_enter_from_user_mode_prepare() and
- * syscall_enter_from_user_mode_work().
+ * This is the combination of enter_from_user_mode() and
+ * syscall_enter_from_user_mode_work() to be used when there is no
+ * architecture specific work to be done between the two.
*
* Returns: The original or a modified syscall number. See
* syscall_enter_from_user_mode_work() for further explanation.
@@ -162,7 +146,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
local_irq_enable();
}
- rseq_syscall(regs);
+ rseq_debug_syscall_return(regs);
/*
* Do one-time syscall specific work. If these work items are
@@ -172,7 +156,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
if (unlikely(work & SYSCALL_WORK_EXIT))
syscall_exit_work(regs, work);
local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
+ syscall_exit_to_user_mode_prepare(regs);
}
/**
diff --git a/include/linux/entry-virt.h b/include/linux/entry-virt.h
index 42c89e3e5ca7..bfa767702d9a 100644
--- a/include/linux/entry-virt.h
+++ b/include/linux/entry-virt.h
@@ -32,7 +32,7 @@
*/
static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work);
-#ifndef arch_xfer_to_guest_mode_work
+#ifndef arch_xfer_to_guest_mode_handle_work
static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work)
{
return 0;
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c2d8b4ec62eb..5c9162193d26 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -492,7 +492,7 @@ struct ethtool_pause_stats {
};
#define ETHTOOL_MAX_LANES 8
-/**
+/*
* IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for
* the end-of-list marker, total 17 items
*/
diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h
index 81f0e698acbf..f206370060e1 100644
--- a/include/linux/fbcon.h
+++ b/include/linux/fbcon.h
@@ -18,6 +18,7 @@ void fbcon_suspended(struct fb_info *info);
void fbcon_resumed(struct fb_info *info);
int fbcon_mode_deleted(struct fb_info *info,
struct fb_videomode *mode);
+void fbcon_delete_modelist(struct list_head *head);
void fbcon_new_modelist(struct fb_info *info);
void fbcon_get_requirement(struct fb_info *info,
struct fb_blit_caps *caps);
@@ -38,6 +39,7 @@ static inline void fbcon_suspended(struct fb_info *info) {}
static inline void fbcon_resumed(struct fb_info *info) {}
static inline int fbcon_mode_deleted(struct fb_info *info,
struct fb_videomode *mode) { return 0; }
+static inline void fbcon_delete_modelist(struct list_head *head) {}
static inline void fbcon_new_modelist(struct fb_info *info) {}
static inline void fbcon_get_requirement(struct fb_info *info,
struct fb_blit_caps *caps) {}
diff --git a/include/linux/file.h b/include/linux/file.h
index af1768d934a0..cf389fde9bc2 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -127,4 +127,130 @@ extern void __fput_sync(struct file *);
extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
+/*
+ * fd_prepare: Combined fd + file allocation cleanup class.
+ * @err: Error code to indicate if allocation succeeded.
+ * @__fd: Allocated fd (may not be accessed directly)
+ * @__file: Allocated struct file pointer (may not be accessed directly)
+ *
+ * Allocates an fd and a file together. On error paths, automatically cleans
+ * up whichever resource was successfully allocated. Allows flexible file
+ * allocation with different functions per usage.
+ *
+ * Do not use directly.
+ */
+struct fd_prepare {
+ s32 err;
+ s32 __fd; /* do not access directly */
+ struct file *__file; /* do not access directly */
+};
+
+/* Typedef for fd_prepare cleanup guards. */
+typedef struct fd_prepare class_fd_prepare_t;
+
+/*
+ * Accessors for fd_prepare class members.
+ * _Generic() is used for zero-cost type safety.
+ */
+#define fd_prepare_fd(_fdf) \
+ (_Generic((_fdf), struct fd_prepare: (_fdf).__fd))
+
+#define fd_prepare_file(_fdf) \
+ (_Generic((_fdf), struct fd_prepare: (_fdf).__file))
+
+/* Do not use directly. */
+static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf)
+{
+ if (unlikely(fdf->err)) {
+ if (likely(fdf->__fd >= 0))
+ put_unused_fd(fdf->__fd);
+ if (unlikely(!IS_ERR_OR_NULL(fdf->__file)))
+ fput(fdf->__file);
+ }
+}
+
+/* Do not use directly. */
+static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf)
+{
+ if (unlikely(fdf->err))
+ return fdf->err;
+ if (unlikely(fdf->__fd < 0))
+ return fdf->__fd;
+ if (unlikely(IS_ERR(fdf->__file)))
+ return PTR_ERR(fdf->__file);
+ if (unlikely(!fdf->__file))
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * __FD_PREPARE_INIT - Helper to initialize fd_prepare class.
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: expression that returns struct file *
+ *
+ * Returns a struct fd_prepare with fd, file, and err set.
+ * If fd allocation fails, fd will be negative and err will be set. If
+ * fd succeeds but file_init_expr fails, file will be ERR_PTR and err
+ * will be set. The err field is the single source of truth for error
+ * checking.
+ */
+#define __FD_PREPARE_INIT(_fd_flags, _file_owned) \
+ ({ \
+ struct fd_prepare fdf = { \
+ .__fd = get_unused_fd_flags((_fd_flags)), \
+ }; \
+ if (likely(fdf.__fd >= 0)) \
+ fdf.__file = (_file_owned); \
+ fdf.err = ACQUIRE_ERR(fd_prepare, &fdf); \
+ fdf; \
+ })
+
+/*
+ * FD_PREPARE - Macro to declare and initialize an fd_prepare variable.
+ *
+ * Declares and initializes an fd_prepare variable with automatic
+ * cleanup. No separate scope required - cleanup happens when variable
+ * goes out of scope.
+ *
+ * @_fdf: name of struct fd_prepare variable to define
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: struct file to take ownership of (can be expression)
+ */
+#define FD_PREPARE(_fdf, _fd_flags, _file_owned) \
+ CLASS_INIT(fd_prepare, _fdf, __FD_PREPARE_INIT(_fd_flags, _file_owned))
+
+/*
+ * fd_publish - Publish prepared fd and file to the fd table.
+ * @_fdf: struct fd_prepare variable
+ */
+#define fd_publish(_fdf) \
+ ({ \
+ struct fd_prepare *fdp = &(_fdf); \
+ VFS_WARN_ON_ONCE(fdp->err); \
+ VFS_WARN_ON_ONCE(fdp->__fd < 0); \
+ VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \
+ fd_install(fdp->__fd, fdp->__file); \
+ fdp->__fd; \
+ })
+
+/* Do not use directly. */
+#define __FD_ADD(_fdf, _fd_flags, _file_owned) \
+ ({ \
+ FD_PREPARE(_fdf, _fd_flags, _file_owned); \
+ s32 ret = _fdf.err; \
+ if (likely(!ret)) \
+ ret = fd_publish(_fdf); \
+ ret; \
+ })
+
+/*
+ * FD_ADD - Allocate and install an fd and file in one step.
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: struct file to take ownership of
+ *
+ * Returns the allocated fd number, or negative error code on failure.
+ */
+#define FD_ADD(_fd_flags, _file_owned) \
+ __FD_ADD(__UNIQUE_ID(fd_prepare), _fd_flags, _file_owned)
+
#endif /* __LINUX_FILE_H */
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index c2ce8ba05d06..54b824c05299 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -159,6 +159,8 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int,
int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
int fcntl_getlease(struct file *filp);
+int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg);
+int fcntl_getdeleg(struct file *filp, struct delegation *deleg);
static inline bool lock_is_unlock(struct file_lock *fl)
{
@@ -212,7 +214,14 @@ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
void locks_init_lease(struct file_lease *);
void locks_free_lease(struct file_lease *fl);
struct file_lease *locks_alloc_lease(void);
-int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
+
+#define LEASE_BREAK_LEASE BIT(0) // break leases and delegations
+#define LEASE_BREAK_DELEG BIT(1) // break delegations only
+#define LEASE_BREAK_LAYOUT BIT(2) // break layouts only
+#define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break
+#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event
+
+int __break_lease(struct inode *inode, unsigned int flags);
void lease_get_mtime(struct inode *, struct timespec64 *time);
int generic_setlease(struct file *, int, struct file_lease **, void **priv);
int kernel_setlease(struct file *, int, struct file_lease **, void **);
@@ -271,6 +280,16 @@ static inline int fcntl_getlease(struct file *filp)
return F_UNLCK;
}
+static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg)
+{
+ return -EINVAL;
+}
+
+static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg)
+{
+ return -EINVAL;
+}
+
static inline bool lock_is_unlock(struct file_lock *fl)
{
return false;
@@ -367,7 +386,7 @@ static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *f
return -ENOLCK;
}
-static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
+static inline int __break_lease(struct inode *inode, unsigned int flags)
{
return 0;
}
@@ -428,6 +447,17 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
}
#ifdef CONFIG_FILE_LOCKING
+static inline unsigned int openmode_to_lease_flags(unsigned int mode)
+{
+ unsigned int flags = 0;
+
+ if ((mode & O_ACCMODE) == O_RDONLY)
+ flags |= LEASE_BREAK_OPEN_RDONLY;
+ if (mode & O_NONBLOCK)
+ flags |= LEASE_BREAK_NONBLOCK;
+ return flags;
+}
+
static inline int break_lease(struct inode *inode, unsigned int mode)
{
struct file_lock_context *flctx;
@@ -443,11 +473,11 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
return 0;
smp_mb();
if (!list_empty_careful(&flctx->flc_lease))
- return __break_lease(inode, mode, FL_LEASE);
+ return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode));
return 0;
}
-static inline int break_deleg(struct inode *inode, unsigned int mode)
+static inline int break_deleg(struct inode *inode, unsigned int flags)
{
struct file_lock_context *flctx;
@@ -461,60 +491,84 @@ static inline int break_deleg(struct inode *inode, unsigned int mode)
if (!flctx)
return 0;
smp_mb();
- if (!list_empty_careful(&flctx->flc_lease))
- return __break_lease(inode, mode, FL_DELEG);
+ if (!list_empty_careful(&flctx->flc_lease)) {
+ flags |= LEASE_BREAK_DELEG;
+ return __break_lease(inode, flags);
+ }
return 0;
}
-static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
+struct delegated_inode {
+ struct inode *di_inode;
+};
+
+static inline bool is_delegated(struct delegated_inode *di)
+{
+ return di->di_inode;
+}
+
+static inline int try_break_deleg(struct inode *inode,
+ struct delegated_inode *di)
{
int ret;
- ret = break_deleg(inode, O_WRONLY|O_NONBLOCK);
- if (ret == -EWOULDBLOCK && delegated_inode) {
- *delegated_inode = inode;
+ ret = break_deleg(inode, LEASE_BREAK_NONBLOCK);
+ if (ret == -EWOULDBLOCK && di) {
+ di->di_inode = inode;
ihold(inode);
}
return ret;
}
-static inline int break_deleg_wait(struct inode **delegated_inode)
+static inline int break_deleg_wait(struct delegated_inode *di)
{
int ret;
- ret = break_deleg(*delegated_inode, O_WRONLY);
- iput(*delegated_inode);
- *delegated_inode = NULL;
+ ret = break_deleg(di->di_inode, 0);
+ iput(di->di_inode);
+ di->di_inode = NULL;
return ret;
}
static inline int break_layout(struct inode *inode, bool wait)
{
smp_mb();
- if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
- return __break_lease(inode,
- wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
- FL_LAYOUT);
+ if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) {
+ unsigned int flags = LEASE_BREAK_LAYOUT;
+
+ if (!wait)
+ flags |= LEASE_BREAK_NONBLOCK;
+
+ return __break_lease(inode, flags);
+ }
return 0;
}
#else /* !CONFIG_FILE_LOCKING */
-static inline int break_lease(struct inode *inode, unsigned int mode)
+struct delegated_inode { };
+
+static inline bool is_delegated(struct delegated_inode *di)
+{
+ return false;
+}
+
+static inline int break_lease(struct inode *inode, bool wait)
{
return 0;
}
-static inline int break_deleg(struct inode *inode, unsigned int mode)
+static inline int break_deleg(struct inode *inode, unsigned int flags)
{
return 0;
}
-static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
+static inline int try_break_deleg(struct inode *inode,
+ struct delegated_inode *delegated_inode)
{
return 0;
}
-static inline int break_deleg_wait(struct inode **delegated_inode)
+static inline int break_deleg_wait(struct delegated_inode *delegated_inode)
{
BUG();
return 0;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5c859b8131a..973233b82dc1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -901,6 +901,26 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
cb->data_end = skb->data + skb_headlen(skb);
}
+static inline int bpf_prog_run_data_pointers(
+ const struct bpf_prog *prog,
+ struct sk_buff *skb)
+{
+ struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+ void *save_data_meta, *save_data_end;
+ int res;
+
+ save_data_meta = cb->data_meta;
+ save_data_end = cb->data_end;
+
+ bpf_compute_data_pointers(skb);
+ res = bpf_prog_run(prog, skb);
+
+ cb->data_meta = save_data_meta;
+ cb->data_end = save_data_end;
+
+ return res;
+}
+
/* Similar to bpf_compute_data_pointers(), except that save orginal
* data in cb->data and cb->meta_data for restore.
*/
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 32884c9721e5..0a8c6c4d1a82 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -22,14 +22,18 @@ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */
extern unsigned int freeze_timeout_msecs;
/*
- * Check if a process has been frozen
+ * Check if a process has been frozen for PM or cgroup1 freezer. Note that
+ * cgroup2 freezer uses the job control mechanism and does not interact with
+ * the PM freezer.
*/
extern bool frozen(struct task_struct *p);
extern bool freezing_slow_path(struct task_struct *p);
/*
- * Check if there is a request to freeze a process
+ * Check if there is a request to freeze a task from PM or cgroup1 freezer.
+ * Note that cgroup2 freezer uses the job control mechanism and does not
+ * interact with the PM freezer.
*/
static inline bool freezing(struct task_struct *p)
{
@@ -63,9 +67,9 @@ extern bool freeze_task(struct task_struct *p);
extern bool set_freezable(void);
#ifdef CONFIG_CGROUP_FREEZER
-extern bool cgroup_freezing(struct task_struct *task);
+extern bool cgroup1_freezing(struct task_struct *task);
#else /* !CONFIG_CGROUP_FREEZER */
-static inline bool cgroup_freezing(struct task_struct *task)
+static inline bool cgroup1_freezing(struct task_struct *task)
{
return false;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..ce25feb06727 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_FS_H
#define _LINUX_FS_H
+#include <linux/fs/super.h>
#include <linux/vfsdebug.h>
#include <linux/linkage.h>
#include <linux/wait_bit.h>
@@ -11,7 +12,6 @@
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
-#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
@@ -37,7 +37,6 @@
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
-#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <linux/mount.h>
@@ -52,11 +51,9 @@
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
-struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct io_comp_batch;
-struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
@@ -70,16 +67,13 @@ struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
-struct workqueue_struct;
struct iov_iter;
-struct fscrypt_operations;
-struct fsverity_operations;
struct fsnotify_mark_connector;
-struct fsnotify_sb_info;
struct fs_context;
struct fs_parameter_spec;
struct file_kattr;
struct iomap_ops;
+struct delegated_inode;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -299,11 +293,6 @@ struct iattr {
};
/*
- * Includes for diskquotas.
- */
-#include <linux/quota.h>
-
-/*
* Maximum number of layers of fs stack. Needs to be limited to
* prevent kernel stack overflow
*/
@@ -367,23 +356,9 @@ struct readahead_control;
#define IOCB_NOIO (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE (1 << 21)
-/*
- * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
- * iocb completion can be passed back to the owner for execution from a safe
- * context rather than needing to be punted through a workqueue. If this
- * flag is set, the bio completion handling may set iocb->dio_complete to a
- * handler function and iocb->private to context information for that handler.
- * The issuer should call the handler with that context information from task
- * context to complete the processing of the iocb. Note that while this
- * provides a task context for the dio_complete() callback, it should only be
- * used on the completion side for non-IO generating completions. It's fine to
- * call blocking functions from this callback, but they should not wait for
- * unrelated IO (like cache flushing, new IO generation, etc).
- */
-#define IOCB_DIO_CALLER_COMP (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
-#define IOCB_AIO_RW (1 << 23)
-#define IOCB_HAS_METADATA (1 << 24)
+#define IOCB_AIO_RW (1 << 22)
+#define IOCB_HAS_METADATA (1 << 23)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
@@ -400,7 +375,6 @@ struct readahead_control;
{ IOCB_WAITQ, "WAITQ" }, \
{ IOCB_NOIO, "NOIO" }, \
{ IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \
- { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \
{ IOCB_AIO_RW, "AIO_RW" }, \
{ IOCB_HAS_METADATA, "AIO_HAS_METADATA" }
@@ -412,23 +386,13 @@ struct kiocb {
int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */
u8 ki_write_stream;
- union {
- /*
- * Only used for async buffered reads, where it denotes the
- * page waitqueue associated with completing the read. Valid
- * IFF IOCB_WAITQ is set.
- */
- struct wait_page_queue *ki_waitq;
- /*
- * Can be used for O_DIRECT IO, where the completion handling
- * is punted back to the issuer of the IO. May only be set
- * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
- * must then check for presence of this handler when ki_complete
- * is invoked. The data passed in to this handler must be
- * assigned to ->private when dio_complete is assigned.
- */
- ssize_t (*dio_complete)(void *data);
- };
+
+ /*
+ * Only used for async buffered reads, where it denotes the page
+ * waitqueue associated with completing the read.
+ * Valid IFF IOCB_WAITQ is set.
+ */
+ struct wait_page_queue *ki_waitq;
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -659,13 +623,14 @@ is_uncached_acl(struct posix_acl *acl)
return (long)acl & 1;
}
-#define IOP_FASTPERM 0x0001
-#define IOP_LOOKUP 0x0002
-#define IOP_NOFOLLOW 0x0004
-#define IOP_XATTR 0x0008
+#define IOP_FASTPERM 0x0001
+#define IOP_LOOKUP 0x0002
+#define IOP_NOFOLLOW 0x0004
+#define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010
-#define IOP_MGTIME 0x0020
-#define IOP_CACHED_LINK 0x0040
+#define IOP_MGTIME 0x0020
+#define IOP_CACHED_LINK 0x0040
+#define IOP_FASTPERM_MAY_EXEC 0x0080
/*
* Inode state bits. Protected by inode->i_lock
@@ -759,7 +724,7 @@ enum inode_state_bits {
/* reserved wait address bit 3 */
};
-enum inode_state_flags_t {
+enum inode_state_flags_enum {
I_NEW = (1U << __I_NEW),
I_SYNC = (1U << __I_SYNC),
I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING),
@@ -786,6 +751,13 @@ enum inode_state_flags_t {
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
/*
+ * Use inode_state_read() & friends to access.
+ */
+struct inode_state_flags {
+ enum inode_state_flags_enum __state;
+};
+
+/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
* of the 'struct inode'
@@ -793,14 +765,13 @@ enum inode_state_flags_t {
struct inode {
umode_t i_mode;
unsigned short i_opflags;
- kuid_t i_uid;
- kgid_t i_gid;
unsigned int i_flags;
-
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
+ kuid_t i_uid;
+ kgid_t i_gid;
const struct inode_operations *i_op;
struct super_block *i_sb;
@@ -843,7 +814,7 @@ struct inode {
#endif
/* Misc */
- enum inode_state_flags_t i_state;
+ struct inode_state_flags i_state;
/* 32-bit hole */
struct rw_semaphore i_rwsem;
@@ -902,6 +873,80 @@ struct inode {
void *i_private; /* fs or device private pointer */
} __randomize_layout;
+/*
+ * i_state handling
+ *
+ * We hide all of it behind helpers so that we can validate consumers.
+ */
+static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode)
+{
+ return READ_ONCE(inode->i_state.__state);
+}
+
+static inline enum inode_state_flags_enum inode_state_read(struct inode *inode)
+{
+ lockdep_assert_held(&inode->i_lock);
+ return inode->i_state.__state;
+}
+
+static inline void inode_state_set_raw(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags);
+}
+
+static inline void inode_state_set(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ lockdep_assert_held(&inode->i_lock);
+ inode_state_set_raw(inode, flags);
+}
+
+static inline void inode_state_clear_raw(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags);
+}
+
+static inline void inode_state_clear(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ lockdep_assert_held(&inode->i_lock);
+ inode_state_clear_raw(inode, flags);
+}
+
+static inline void inode_state_assign_raw(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ WRITE_ONCE(inode->i_state.__state, flags);
+}
+
+static inline void inode_state_assign(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ lockdep_assert_held(&inode->i_lock);
+ inode_state_assign_raw(inode, flags);
+}
+
+static inline void inode_state_replace_raw(struct inode *inode,
+ enum inode_state_flags_enum clearflags,
+ enum inode_state_flags_enum setflags)
+{
+ enum inode_state_flags_enum flags;
+ flags = inode->i_state.__state;
+ flags &= ~clearflags;
+ flags |= setflags;
+ inode_state_assign_raw(inode, flags);
+}
+
+static inline void inode_state_replace(struct inode *inode,
+ enum inode_state_flags_enum clearflags,
+ enum inode_state_flags_enum setflags)
+{
+ lockdep_assert_held(&inode->i_lock);
+ inode_state_replace_raw(inode, clearflags, setflags);
+}
+
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
@@ -949,6 +994,8 @@ static inline void inode_fake_hash(struct inode *inode)
hlist_add_fake(&inode->i_hash);
}
+void wait_on_new_inode(struct inode *inode);
+
/*
* inode->i_rwsem nesting subclasses for the lock validator:
*
@@ -1348,49 +1395,6 @@ extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct file *file);
/*
- * sb->s_flags. Note that these mirror the equivalent MS_* flags where
- * represented in both.
- */
-#define SB_RDONLY BIT(0) /* Mount read-only */
-#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */
-#define SB_NODEV BIT(2) /* Disallow access to device special files */
-#define SB_NOEXEC BIT(3) /* Disallow program execution */
-#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */
-#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */
-#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */
-#define SB_NOATIME BIT(10) /* Do not update access times. */
-#define SB_NODIRATIME BIT(11) /* Do not update directory access times */
-#define SB_SILENT BIT(15)
-#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */
-#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */
-#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */
-#define SB_I_VERSION BIT(23) /* Update inode I_version field */
-#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */
-
-/* These sb flags are internal to the kernel */
-#define SB_DEAD BIT(21)
-#define SB_DYING BIT(24)
-#define SB_FORCE BIT(27)
-#define SB_NOSEC BIT(28)
-#define SB_BORN BIT(29)
-#define SB_ACTIVE BIT(30)
-#define SB_NOUSER BIT(31)
-
-/* These flags relate to encoding and casefolding */
-#define SB_ENC_STRICT_MODE_FL (1 << 0)
-#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1)
-
-#define sb_has_strict_encoding(sb) \
- (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
-
-#if IS_ENABLED(CONFIG_UNICODE)
-#define sb_no_casefold_compat_fallback(sb) \
- (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
-#else
-#define sb_no_casefold_compat_fallback(sb) (1)
-#endif
-
-/*
* Umount options
*/
@@ -1400,191 +1404,6 @@ extern int send_sigurg(struct file *file);
#define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */
#define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */
-/* sb->s_iflags */
-#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
-#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
-#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
-#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
-
-/* sb->s_iflags to limit user namespace mounts */
-#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
-#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020
-#define SB_I_UNTRUSTED_MOUNTER 0x00000040
-#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080
-
-#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */
-#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */
-#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
-#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
-#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
-#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
-#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */
-
-/* Possible states of 'frozen' field */
-enum {
- SB_UNFROZEN = 0, /* FS is unfrozen */
- SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */
- SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */
- SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop
- * internal threads if needed) */
- SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */
-};
-
-#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
-
-struct sb_writers {
- unsigned short frozen; /* Is sb frozen? */
- int freeze_kcount; /* How many kernel freeze requests? */
- int freeze_ucount; /* How many userspace freeze requests? */
- const void *freeze_owner; /* Owner of the freeze */
- struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
-};
-
-struct mount;
-
-struct super_block {
- struct list_head s_list; /* Keep this first */
- dev_t s_dev; /* search index; _not_ kdev_t */
- unsigned char s_blocksize_bits;
- unsigned long s_blocksize;
- loff_t s_maxbytes; /* Max file size */
- struct file_system_type *s_type;
- const struct super_operations *s_op;
- const struct dquot_operations *dq_op;
- const struct quotactl_ops *s_qcop;
- const struct export_operations *s_export_op;
- unsigned long s_flags;
- unsigned long s_iflags; /* internal SB_I_* flags */
- unsigned long s_magic;
- struct dentry *s_root;
- struct rw_semaphore s_umount;
- int s_count;
- atomic_t s_active;
-#ifdef CONFIG_SECURITY
- void *s_security;
-#endif
- const struct xattr_handler * const *s_xattr;
-#ifdef CONFIG_FS_ENCRYPTION
- const struct fscrypt_operations *s_cop;
- struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */
-#endif
-#ifdef CONFIG_FS_VERITY
- const struct fsverity_operations *s_vop;
-#endif
-#if IS_ENABLED(CONFIG_UNICODE)
- struct unicode_map *s_encoding;
- __u16 s_encoding_flags;
-#endif
- struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
- struct mount *s_mounts; /* list of mounts; _not_ for fs use */
- struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */
- struct file *s_bdev_file;
- struct backing_dev_info *s_bdi;
- struct mtd_info *s_mtd;
- struct hlist_node s_instances;
- unsigned int s_quota_types; /* Bitmask of supported quota types */
- struct quota_info s_dquot; /* Diskquota specific options */
-
- struct sb_writers s_writers;
-
- /*
- * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
- * s_fsnotify_info together for cache efficiency. They are frequently
- * accessed and rarely modified.
- */
- void *s_fs_info; /* Filesystem private info */
-
- /* Granularity of c/m/atime in ns (cannot be worse than a second) */
- u32 s_time_gran;
- /* Time limits for c/m/atime in seconds */
- time64_t s_time_min;
- time64_t s_time_max;
-#ifdef CONFIG_FSNOTIFY
- u32 s_fsnotify_mask;
- struct fsnotify_sb_info *s_fsnotify_info;
-#endif
-
- /*
- * q: why are s_id and s_sysfs_name not the same? both are human
- * readable strings that identify the filesystem
- * a: s_id is allowed to change at runtime; it's used in log messages,
- * and we want to when a device starts out as single device (s_id is dev
- * name) but then a device is hot added and we have to switch to
- * identifying it by UUID
- * but s_sysfs_name is a handle for programmatic access, and can't
- * change at runtime
- */
- char s_id[32]; /* Informational name */
- uuid_t s_uuid; /* UUID */
- u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */
-
- /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
- char s_sysfs_name[UUID_STRING_LEN + 1];
-
- unsigned int s_max_links;
- unsigned int s_d_flags; /* default d_flags for dentries */
-
- /*
- * The next field is for VFS *only*. No filesystems have any business
- * even looking at it. You had been warned.
- */
- struct mutex s_vfs_rename_mutex; /* Kludge */
-
- /*
- * Filesystem subtype. If non-empty the filesystem type field
- * in /proc/mounts will be "type.subtype"
- */
- const char *s_subtype;
-
- const struct dentry_operations *__s_d_op; /* default d_op for dentries */
-
- struct shrinker *s_shrink; /* per-sb shrinker handle */
-
- /* Number of inodes with nlink == 0 but still referenced */
- atomic_long_t s_remove_count;
-
- /* Read-only state of the superblock is being changed */
- int s_readonly_remount;
-
- /* per-sb errseq_t for reporting writeback errors via syncfs */
- errseq_t s_wb_err;
-
- /* AIO completions deferred from interrupt context */
- struct workqueue_struct *s_dio_done_wq;
- struct hlist_head s_pins;
-
- /*
- * Owning user namespace and default context in which to
- * interpret filesystem uids, gids, quotas, device nodes,
- * xattrs and security labels.
- */
- struct user_namespace *s_user_ns;
-
- /*
- * The list_lru structure is essentially just a pointer to a table
- * of per-node lru lists, each of which has its own spinlock.
- * There is no need to put them into separate cachelines.
- */
- struct list_lru s_dentry_lru;
- struct list_lru s_inode_lru;
- struct rcu_head rcu;
- struct work_struct destroy_work;
-
- struct mutex s_sync_lock; /* sync serialisation lock */
-
- /*
- * Indicates how deep in a filesystem stack this SB is
- */
- int s_stack_depth;
-
- /* s_inode_list_lock protects s_inodes */
- spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
- struct list_head s_inodes; /* all inodes */
-
- spinlock_t s_inode_wblist_lock;
- struct list_head s_inodes_wb; /* writeback inodes */
-} __randomize_layout;
-
static inline struct user_namespace *i_user_ns(const struct inode *inode)
{
return inode->i_sb->s_user_ns;
@@ -1902,66 +1721,6 @@ struct timespec64 simple_inode_init_ts(struct inode *inode);
* Snapshotting support.
*/
-/*
- * These are internal functions, please use sb_start_{write,pagefault,intwrite}
- * instead.
- */
-static inline void __sb_end_write(struct super_block *sb, int level)
-{
- percpu_up_read(sb->s_writers.rw_sem + level-1);
-}
-
-static inline void __sb_start_write(struct super_block *sb, int level)
-{
- percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true);
-}
-
-static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
-{
- return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1);
-}
-
-#define __sb_writers_acquired(sb, lev) \
- percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
-#define __sb_writers_release(sb, lev) \
- percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)
-
-/**
- * __sb_write_started - check if sb freeze level is held
- * @sb: the super we write to
- * @level: the freeze level
- *
- * * > 0 - sb freeze level is held
- * * 0 - sb freeze level is not held
- * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
- */
-static inline int __sb_write_started(const struct super_block *sb, int level)
-{
- return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
-}
-
-/**
- * sb_write_started - check if SB_FREEZE_WRITE is held
- * @sb: the super we write to
- *
- * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
- */
-static inline bool sb_write_started(const struct super_block *sb)
-{
- return __sb_write_started(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_write_not_started - check if SB_FREEZE_WRITE is not held
- * @sb: the super we write to
- *
- * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
- */
-static inline bool sb_write_not_started(const struct super_block *sb)
-{
- return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
-}
-
/**
* file_write_started - check if SB_FREEZE_WRITE is held
* @file: the file we write to
@@ -1992,137 +1751,26 @@ static inline bool file_write_not_started(const struct file *file)
return sb_write_not_started(file_inode(file)->i_sb);
}
-/**
- * sb_end_write - drop write access to a superblock
- * @sb: the super we wrote to
- *
- * Decrement number of writers to the filesystem. Wake up possible waiters
- * wanting to freeze the filesystem.
- */
-static inline void sb_end_write(struct super_block *sb)
-{
- __sb_end_write(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_end_pagefault - drop write access to a superblock from a page fault
- * @sb: the super we wrote to
- *
- * Decrement number of processes handling write page fault to the filesystem.
- * Wake up possible waiters wanting to freeze the filesystem.
- */
-static inline void sb_end_pagefault(struct super_block *sb)
-{
- __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
-}
-
-/**
- * sb_end_intwrite - drop write access to a superblock for internal fs purposes
- * @sb: the super we wrote to
- *
- * Decrement fs-internal number of writers to the filesystem. Wake up possible
- * waiters wanting to freeze the filesystem.
- */
-static inline void sb_end_intwrite(struct super_block *sb)
-{
- __sb_end_write(sb, SB_FREEZE_FS);
-}
-
-/**
- * sb_start_write - get write access to a superblock
- * @sb: the super we write to
- *
- * When a process wants to write data or metadata to a file system (i.e. dirty
- * a page or an inode), it should embed the operation in a sb_start_write() -
- * sb_end_write() pair to get exclusion against file system freezing. This
- * function increments number of writers preventing freezing. If the file
- * system is already frozen, the function waits until the file system is
- * thawed.
- *
- * Since freeze protection behaves as a lock, users have to preserve
- * ordering of freeze protection and other filesystem locks. Generally,
- * freeze protection should be the outermost lock. In particular, we have:
- *
- * sb_start_write
- * -> i_rwsem (write path, truncate, directory ops, ...)
- * -> s_umount (freeze_super, thaw_super)
- */
-static inline void sb_start_write(struct super_block *sb)
-{
- __sb_start_write(sb, SB_FREEZE_WRITE);
-}
-
-static inline bool sb_start_write_trylock(struct super_block *sb)
-{
- return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_start_pagefault - get write access to a superblock from a page fault
- * @sb: the super we write to
- *
- * When a process starts handling write page fault, it should embed the
- * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
- * exclusion against file system freezing. This is needed since the page fault
- * is going to dirty a page. This function increments number of running page
- * faults preventing freezing. If the file system is already frozen, the
- * function waits until the file system is thawed.
- *
- * Since page fault freeze protection behaves as a lock, users have to preserve
- * ordering of freeze protection and other filesystem locks. It is advised to
- * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
- * handling code implies lock dependency:
- *
- * mmap_lock
- * -> sb_start_pagefault
- */
-static inline void sb_start_pagefault(struct super_block *sb)
-{
- __sb_start_write(sb, SB_FREEZE_PAGEFAULT);
-}
-
-/**
- * sb_start_intwrite - get write access to a superblock for internal fs purposes
- * @sb: the super we write to
- *
- * This is the third level of protection against filesystem freezing. It is
- * free for use by a filesystem. The only requirement is that it must rank
- * below sb_start_pagefault.
- *
- * For example filesystem can call sb_start_intwrite() when starting a
- * transaction which somewhat eases handling of freezing for internal sources
- * of filesystem changes (internal fs threads, discarding preallocation on file
- * close, etc.).
- */
-static inline void sb_start_intwrite(struct super_block *sb)
-{
- __sb_start_write(sb, SB_FREEZE_FS);
-}
-
-static inline bool sb_start_intwrite_trylock(struct super_block *sb)
-{
- return __sb_start_write_trylock(sb, SB_FREEZE_FS);
-}
-
bool inode_owner_or_capable(struct mnt_idmap *idmap,
const struct inode *inode);
/*
* VFS helper functions..
*/
-int vfs_create(struct mnt_idmap *, struct inode *,
- struct dentry *, umode_t, bool);
+int vfs_create(struct mnt_idmap *, struct dentry *, umode_t,
+ struct delegated_inode *);
struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
- struct dentry *, umode_t);
+ struct dentry *, umode_t, struct delegated_inode *);
int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
- umode_t, dev_t);
+ umode_t, dev_t, struct delegated_inode *);
int vfs_symlink(struct mnt_idmap *, struct inode *,
- struct dentry *, const char *);
+ struct dentry *, const char *, struct delegated_inode *);
int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
- struct dentry *, struct inode **);
-int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
+ struct dentry *, struct delegated_inode *);
+int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ struct delegated_inode *);
int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
- struct inode **);
+ struct delegated_inode *);
/**
* struct renamedata - contains all information required for renaming
@@ -2140,7 +1788,7 @@ struct renamedata {
struct dentry *old_dentry;
struct dentry *new_parent;
struct dentry *new_dentry;
- struct inode **delegated_inode;
+ struct delegated_inode *delegated_inode;
unsigned int flags;
} __randomize_layout;
@@ -2150,7 +1798,7 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *dentry)
{
return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
- WHITEOUT_DEV);
+ WHITEOUT_DEV, NULL);
}
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
@@ -2431,72 +2079,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
loff_t len, unsigned int remap_flags);
-/**
- * enum freeze_holder - holder of the freeze
- * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
- * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
- * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
- * @FREEZE_EXCL: a freeze that can only be undone by the owner
- *
- * Indicate who the owner of the freeze or thaw request is and whether
- * the freeze needs to be exclusive or can nest.
- * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
- * same holder aren't allowed. It is however allowed to hold a single
- * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
- * the same time. This is relied upon by some filesystems during online
- * repair or similar.
- */
-enum freeze_holder {
- FREEZE_HOLDER_KERNEL = (1U << 0),
- FREEZE_HOLDER_USERSPACE = (1U << 1),
- FREEZE_MAY_NEST = (1U << 2),
- FREEZE_EXCL = (1U << 3),
-};
-
-struct super_operations {
- struct inode *(*alloc_inode)(struct super_block *sb);
- void (*destroy_inode)(struct inode *);
- void (*free_inode)(struct inode *);
-
- void (*dirty_inode) (struct inode *, int flags);
- int (*write_inode) (struct inode *, struct writeback_control *wbc);
- int (*drop_inode) (struct inode *);
- void (*evict_inode) (struct inode *);
- void (*put_super) (struct super_block *);
- int (*sync_fs)(struct super_block *sb, int wait);
- int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner);
- int (*freeze_fs) (struct super_block *);
- int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner);
- int (*unfreeze_fs) (struct super_block *);
- int (*statfs) (struct dentry *, struct kstatfs *);
- int (*remount_fs) (struct super_block *, int *, char *);
- void (*umount_begin) (struct super_block *);
-
- int (*show_options)(struct seq_file *, struct dentry *);
- int (*show_devname)(struct seq_file *, struct dentry *);
- int (*show_path)(struct seq_file *, struct dentry *);
- int (*show_stats)(struct seq_file *, struct dentry *);
-#ifdef CONFIG_QUOTA
- ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
- ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
- struct dquot __rcu **(*get_dquots)(struct inode *);
-#endif
- long (*nr_cached_objects)(struct super_block *,
- struct shrink_control *);
- long (*free_cached_objects)(struct super_block *,
- struct shrink_control *);
- /*
- * If a filesystem can support graceful removal of a device and
- * continue read-write operations, implement this callback.
- *
- * Return 0 if the filesystem can continue read-write.
- * Non-zero return value or no such callback means the fs will be shutdown
- * as usual.
- */
- int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
- void (*shutdown)(struct super_block *sb);
-};
-
/*
* Inode flags - they have no relation to superblock flags now
*/
@@ -2539,7 +2121,6 @@ struct super_operations {
*/
#define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg))
-static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; }
#define IS_RDONLY(inode) sb_rdonly((inode)->i_sb)
#define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \
((inode)->i_flags & S_SYNC))
@@ -2635,8 +2216,8 @@ static inline int icount_read(const struct inode *inode)
*/
static inline bool inode_is_dirtytime_only(struct inode *inode)
{
- return (inode->i_state & (I_DIRTY_TIME | I_NEW |
- I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
+ return (inode_state_read_once(inode) &
+ (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
}
extern void inc_nlink(struct inode *inode);
@@ -2689,6 +2270,7 @@ struct file_system_type {
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
#define FS_MGTIME 64 /* FS uses multigrain timestamps */
#define FS_LBS 128 /* FS supports LBS */
+#define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
@@ -2773,10 +2355,6 @@ extern int unregister_filesystem(struct file_system_type *);
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
-int freeze_super(struct super_block *super, enum freeze_holder who,
- const void *freeze_owner);
-int thaw_super(struct super_block *super, enum freeze_holder who,
- const void *freeze_owner);
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);
@@ -2819,10 +2397,9 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch
va_end(args);
}
-extern int current_umask(void);
-
extern void ihold(struct inode * inode);
extern void iput(struct inode *);
+void iput_not_last(struct inode *);
int inode_update_timestamps(struct inode *inode, int flags);
int generic_update_time(struct inode *, int);
@@ -2963,12 +2540,6 @@ extern struct kmem_cache *names_cachep;
#define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL)
#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
-extern struct super_block *blockdev_superblock;
-static inline bool sb_is_blkdev_sb(struct super_block *sb)
-{
- return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
-}
-
void emergency_thaw_all(void);
extern int sync_filesystem(struct super_block *);
extern const struct file_operations def_blk_fops;
@@ -3014,7 +2585,7 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
loff_t start, loff_t end);
-int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+int filemap_flush_range(struct address_space *mapping, loff_t start,
loff_t end);
static inline int file_write_and_wait(struct file *file)
@@ -3051,8 +2622,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
} else if (iocb->ki_flags & IOCB_DONTCACHE) {
struct address_space *mapping = iocb->ki_filp->f_mapping;
- filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count,
- iocb->ki_pos - 1);
+ filemap_flush_range(mapping, iocb->ki_pos - count,
+ iocb->ki_pos - 1);
}
return count;
@@ -3071,7 +2642,7 @@ static inline int bmap(struct inode *inode, sector_t *block)
#endif
int notify_change(struct mnt_idmap *, struct dentry *,
- struct iattr *, struct inode **);
+ struct iattr *, struct delegated_inode *);
int inode_permission(struct mnt_idmap *, struct inode *, int);
int generic_permission(struct mnt_idmap *, struct inode *, int);
static inline int file_permission(struct file *file, int mask)
@@ -3101,7 +2672,7 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
* file_start_write - get write access to a superblock for regular file io
* @file: the file we want to write to
*
- * This is a variant of sb_start_write() which is a noop on non-regualr file.
+ * This is a variant of sb_start_write() which is a noop on non-regular file.
* Should be matched with a call to file_end_write().
*/
static inline void file_start_write(struct file *file)
@@ -3269,6 +2840,7 @@ extern struct file * open_exec(const char *);
/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);
+u64 vfsmount_to_propagation_flags(struct vfsmount *mnt);
extern char *file_path(struct file *, char *, int);
@@ -3326,7 +2898,7 @@ extern void d_mark_dontcache(struct inode *inode);
extern struct inode *ilookup5_nowait(struct super_block *sb,
unsigned long hashval, int (*test)(struct inode *, void *),
- void *data);
+ void *data, bool *isnew);
extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data);
extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
@@ -3378,11 +2950,9 @@ static inline bool is_zero_ino(ino_t ino)
return (u32)ino == 0;
}
-/*
- * inode->i_lock must be held
- */
static inline void __iget(struct inode *inode)
{
+ lockdep_assert_held(&inode->i_lock);
atomic_inc(&inode->i_count);
}
@@ -3421,10 +2991,7 @@ static inline void remove_inode_hash(struct inode *inode)
}
extern void inode_sb_list_add(struct inode *inode);
-extern void inode_add_lru(struct inode *inode);
-
-extern int sb_set_blocksize(struct super_block *, int);
-extern int sb_min_blocksize(struct super_block *, int);
+extern void inode_lru_list_add(struct inode *inode);
int generic_file_mmap(struct file *, struct vm_area_struct *);
int generic_file_mmap_prepare(struct vm_area_desc *desc);
@@ -3606,9 +3173,11 @@ extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg);
extern void iterate_supers_type(struct file_system_type *,
void (*)(struct super_block *, void *), void *);
-void filesystems_freeze(void);
+void filesystems_freeze(bool freeze_all);
void filesystems_thaw(void);
+void end_dirop(struct dentry *de);
+
extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
@@ -3745,38 +3314,6 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir,
}
#endif
-static inline struct unicode_map *sb_encoding(const struct super_block *sb)
-{
-#if IS_ENABLED(CONFIG_UNICODE)
- return sb->s_encoding;
-#else
- return NULL;
-#endif
-}
-
-static inline bool sb_has_encoding(const struct super_block *sb)
-{
- return !!sb_encoding(sb);
-}
-
-/*
- * Compare if two super blocks have the same encoding and flags
- */
-static inline bool sb_same_encoding(const struct super_block *sb1,
- const struct super_block *sb2)
-{
-#if IS_ENABLED(CONFIG_UNICODE)
- if (sb1->s_encoding == sb2->s_encoding)
- return true;
-
- return (sb1->s_encoding && sb2->s_encoding &&
- (sb1->s_encoding->version == sb2->s_encoding->version) &&
- (sb1->s_encoding_flags == sb2->s_encoding_flags));
-#else
- return true;
-#endif
-}
-
int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
unsigned int ia_valid);
int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *);
diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
new file mode 100644
index 000000000000..f21ffbb6dea5
--- /dev/null
+++ b/include/linux/fs/super.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_SUPER_H
+#define _LINUX_FS_SUPER_H
+
+#include <linux/fs/super_types.h>
+#include <linux/unicode.h>
+
+/*
+ * These are internal functions, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+static inline void __sb_end_write(struct super_block *sb, int level)
+{
+ percpu_up_read(sb->s_writers.rw_sem + level - 1);
+}
+
+static inline void __sb_start_write(struct super_block *sb, int level)
+{
+ percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true);
+}
+
+static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
+{
+ return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1);
+}
+
+#define __sb_writers_acquired(sb, lev) \
+ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_)
+#define __sb_writers_release(sb, lev) \
+ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_)
+
+/**
+ * __sb_write_started - check if sb freeze level is held
+ * @sb: the super we write to
+ * @level: the freeze level
+ *
+ * * > 0 - sb freeze level is held
+ * * 0 - sb freeze level is not held
+ * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
+ */
+static inline int __sb_write_started(const struct super_block *sb, int level)
+{
+ return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
+}
+
+/**
+ * sb_write_started - check if SB_FREEZE_WRITE is held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
+static inline bool sb_write_started(const struct super_block *sb)
+{
+ return __sb_write_started(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
+static inline bool sb_write_not_started(const struct super_block *sb)
+{
+ return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
+}
+
+/**
+ * sb_end_write - drop write access to a superblock
+ * @sb: the super we wrote to
+ *
+ * Decrement number of writers to the filesystem. Wake up possible waiters
+ * wanting to freeze the filesystem.
+ */
+static inline void sb_end_write(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_end_pagefault - drop write access to a superblock from a page fault
+ * @sb: the super we wrote to
+ *
+ * Decrement number of processes handling write page fault to the filesystem.
+ * Wake up possible waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_pagefault(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
+}
+
+/**
+ * sb_end_intwrite - drop write access to a superblock for internal fs purposes
+ * @sb: the super we wrote to
+ *
+ * Decrement fs-internal number of writers to the filesystem. Wake up possible
+ * waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_intwrite(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_FS);
+}
+
+/**
+ * sb_start_write - get write access to a superblock
+ * @sb: the super we write to
+ *
+ * When a process wants to write data or metadata to a file system (i.e. dirty
+ * a page or an inode), it should embed the operation in a sb_start_write() -
+ * sb_end_write() pair to get exclusion against file system freezing. This
+ * function increments number of writers preventing freezing. If the file
+ * system is already frozen, the function waits until the file system is
+ * thawed.
+ *
+ * Since freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. Generally,
+ * freeze protection should be the outermost lock. In particular, we have:
+ *
+ * sb_start_write
+ * -> i_rwsem (write path, truncate, directory ops, ...)
+ * -> s_umount (freeze_super, thaw_super)
+ */
+static inline void sb_start_write(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_WRITE);
+}
+
+DEFINE_GUARD(super_write,
+ struct super_block *,
+ sb_start_write(_T),
+ sb_end_write(_T))
+
+static inline bool sb_start_write_trylock(struct super_block *sb)
+{
+ return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_start_pagefault - get write access to a superblock from a page fault
+ * @sb: the super we write to
+ *
+ * When a process starts handling write page fault, it should embed the
+ * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
+ * exclusion against file system freezing. This is needed since the page fault
+ * is going to dirty a page. This function increments number of running page
+ * faults preventing freezing. If the file system is already frozen, the
+ * function waits until the file system is thawed.
+ *
+ * Since page fault freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. It is advised to
+ * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
+ * handling code implies lock dependency:
+ *
+ * mmap_lock
+ * -> sb_start_pagefault
+ */
+static inline void sb_start_pagefault(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_PAGEFAULT);
+}
+
+/**
+ * sb_start_intwrite - get write access to a superblock for internal fs purposes
+ * @sb: the super we write to
+ *
+ * This is the third level of protection against filesystem freezing. It is
+ * free for use by a filesystem. The only requirement is that it must rank
+ * below sb_start_pagefault.
+ *
+ * For example filesystem can call sb_start_intwrite() when starting a
+ * transaction which somewhat eases handling of freezing for internal sources
+ * of filesystem changes (internal fs threads, discarding preallocation on file
+ * close, etc.).
+ */
+static inline void sb_start_intwrite(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_FS);
+}
+
+static inline bool sb_start_intwrite_trylock(struct super_block *sb)
+{
+ return __sb_start_write_trylock(sb, SB_FREEZE_FS);
+}
+
+static inline bool sb_rdonly(const struct super_block *sb)
+{
+ return sb->s_flags & SB_RDONLY;
+}
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+ return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static inline struct unicode_map *sb_encoding(const struct super_block *sb)
+{
+ return sb->s_encoding;
+}
+
+/* Compare if two super blocks have the same encoding and flags */
+static inline bool sb_same_encoding(const struct super_block *sb1,
+ const struct super_block *sb2)
+{
+ if (sb1->s_encoding == sb2->s_encoding)
+ return true;
+
+ return (sb1->s_encoding && sb2->s_encoding &&
+ (sb1->s_encoding->version == sb2->s_encoding->version) &&
+ (sb1->s_encoding_flags == sb2->s_encoding_flags));
+}
+#else
+static inline struct unicode_map *sb_encoding(const struct super_block *sb)
+{
+ return NULL;
+}
+
+static inline bool sb_same_encoding(const struct super_block *sb1,
+ const struct super_block *sb2)
+{
+ return true;
+}
+#endif
+
+static inline bool sb_has_encoding(const struct super_block *sb)
+{
+ return !!sb_encoding(sb);
+}
+
+int sb_set_blocksize(struct super_block *sb, int size);
+int __must_check sb_min_blocksize(struct super_block *sb, int size);
+
+int freeze_super(struct super_block *super, enum freeze_holder who,
+ const void *freeze_owner);
+int thaw_super(struct super_block *super, enum freeze_holder who,
+ const void *freeze_owner);
+
+#endif /* _LINUX_FS_SUPER_H */
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
new file mode 100644
index 000000000000..6bd3009e09b3
--- /dev/null
+++ b/include/linux/fs/super_types.h
@@ -0,0 +1,336 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_SUPER_TYPES_H
+#define _LINUX_FS_SUPER_TYPES_H
+
+#include <linux/fs_dirent.h>
+#include <linux/errseq.h>
+#include <linux/list_lru.h>
+#include <linux/list.h>
+#include <linux/list_bl.h>
+#include <linux/llist.h>
+#include <linux/uidgid.h>
+#include <linux/uuid.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/workqueue_types.h>
+#include <linux/quota.h>
+
+struct backing_dev_info;
+struct block_device;
+struct dentry;
+struct dentry_operations;
+struct dquot_operations;
+struct export_operations;
+struct file;
+struct file_system_type;
+struct fscrypt_operations;
+struct fsnotify_sb_info;
+struct fsverity_operations;
+struct kstatfs;
+struct mount;
+struct mtd_info;
+struct quotactl_ops;
+struct shrinker;
+struct unicode_map;
+struct user_namespace;
+struct workqueue_struct;
+struct writeback_control;
+struct xattr_handler;
+
+extern struct super_block *blockdev_superblock;
+
+/* Possible states of 'frozen' field */
+enum {
+ SB_UNFROZEN = 0, /* FS is unfrozen */
+ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */
+ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */
+ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop internal threads if needed) */
+ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */
+};
+
+#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
+
+struct sb_writers {
+ unsigned short frozen; /* Is sb frozen? */
+ int freeze_kcount; /* How many kernel freeze requests? */
+ int freeze_ucount; /* How many userspace freeze requests? */
+ const void *freeze_owner; /* Owner of the freeze */
+ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
+};
+
+/**
+ * enum freeze_holder - holder of the freeze
+ * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
+ * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
+ * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
+ * @FREEZE_EXCL: a freeze that can only be undone by the owner
+ *
+ * Indicate who the owner of the freeze or thaw request is and whether
+ * the freeze needs to be exclusive or can nest.
+ * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
+ * same holder aren't allowed. It is however allowed to hold a single
+ * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
+ * the same time. This is relied upon by some filesystems during online
+ * repair or similar.
+ */
+enum freeze_holder {
+ FREEZE_HOLDER_KERNEL = (1U << 0),
+ FREEZE_HOLDER_USERSPACE = (1U << 1),
+ FREEZE_MAY_NEST = (1U << 2),
+ FREEZE_EXCL = (1U << 3),
+};
+
+struct super_operations {
+ struct inode *(*alloc_inode)(struct super_block *sb);
+ void (*destroy_inode)(struct inode *inode);
+ void (*free_inode)(struct inode *inode);
+ void (*dirty_inode)(struct inode *inode, int flags);
+ int (*write_inode)(struct inode *inode, struct writeback_control *wbc);
+ int (*drop_inode)(struct inode *inode);
+ void (*evict_inode)(struct inode *inode);
+ void (*put_super)(struct super_block *sb);
+ int (*sync_fs)(struct super_block *sb, int wait);
+ int (*freeze_super)(struct super_block *sb, enum freeze_holder who,
+ const void *owner);
+ int (*freeze_fs)(struct super_block *sb);
+ int (*thaw_super)(struct super_block *sb, enum freeze_holder who,
+ const void *owner);
+ int (*unfreeze_fs)(struct super_block *sb);
+ int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs);
+ int (*remount_fs) (struct super_block *, int *, char *);
+ void (*umount_begin)(struct super_block *sb);
+
+ int (*show_options)(struct seq_file *seq, struct dentry *dentry);
+ int (*show_devname)(struct seq_file *seq, struct dentry *dentry);
+ int (*show_path)(struct seq_file *seq, struct dentry *dentry);
+ int (*show_stats)(struct seq_file *seq, struct dentry *dentry);
+#ifdef CONFIG_QUOTA
+ ssize_t (*quota_read)(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off);
+ ssize_t (*quota_write)(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off);
+ struct dquot __rcu **(*get_dquots)(struct inode *inode);
+#endif
+ long (*nr_cached_objects)(struct super_block *sb,
+ struct shrink_control *sc);
+ long (*free_cached_objects)(struct super_block *sb,
+ struct shrink_control *sc);
+ /*
+ * If a filesystem can support graceful removal of a device and
+ * continue read-write operations, implement this callback.
+ *
+ * Return 0 if the filesystem can continue read-write.
+ * Non-zero return value or no such callback means the fs will be shutdown
+ * as usual.
+ */
+ int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
+ void (*shutdown)(struct super_block *sb);
+};
+
+struct super_block {
+ struct list_head s_list; /* Keep this first */
+ dev_t s_dev; /* search index; _not_ kdev_t */
+ unsigned char s_blocksize_bits;
+ unsigned long s_blocksize;
+ loff_t s_maxbytes; /* Max file size */
+ struct file_system_type *s_type;
+ const struct super_operations *s_op;
+ const struct dquot_operations *dq_op;
+ const struct quotactl_ops *s_qcop;
+ const struct export_operations *s_export_op;
+ unsigned long s_flags;
+ unsigned long s_iflags; /* internal SB_I_* flags */
+ unsigned long s_magic;
+ struct dentry *s_root;
+ struct rw_semaphore s_umount;
+ int s_count;
+ atomic_t s_active;
+#ifdef CONFIG_SECURITY
+ void *s_security;
+#endif
+ const struct xattr_handler *const *s_xattr;
+#ifdef CONFIG_FS_ENCRYPTION
+ const struct fscrypt_operations *s_cop;
+ struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */
+#endif
+#ifdef CONFIG_FS_VERITY
+ const struct fsverity_operations *s_vop;
+#endif
+#if IS_ENABLED(CONFIG_UNICODE)
+ struct unicode_map *s_encoding;
+ __u16 s_encoding_flags;
+#endif
+ struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
+ struct mount *s_mounts; /* list of mounts; _not_ for fs use */
+ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */
+ struct file *s_bdev_file;
+ struct backing_dev_info *s_bdi;
+ struct mtd_info *s_mtd;
+ struct hlist_node s_instances;
+ unsigned int s_quota_types; /* Bitmask of supported quota types */
+ struct quota_info s_dquot; /* Diskquota specific options */
+
+ struct sb_writers s_writers;
+
+ /*
+ * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
+ * s_fsnotify_info together for cache efficiency. They are frequently
+ * accessed and rarely modified.
+ */
+ void *s_fs_info; /* Filesystem private info */
+
+ /* Granularity of c/m/atime in ns (cannot be worse than a second) */
+ u32 s_time_gran;
+ /* Time limits for c/m/atime in seconds */
+ time64_t s_time_min;
+ time64_t s_time_max;
+#ifdef CONFIG_FSNOTIFY
+ u32 s_fsnotify_mask;
+ struct fsnotify_sb_info *s_fsnotify_info;
+#endif
+
+ /*
+ * q: why are s_id and s_sysfs_name not the same? both are human
+ * readable strings that identify the filesystem
+ * a: s_id is allowed to change at runtime; it's used in log messages,
+ * and we want to when a device starts out as single device (s_id is dev
+ * name) but then a device is hot added and we have to switch to
+ * identifying it by UUID
+ * but s_sysfs_name is a handle for programmatic access, and can't
+ * change at runtime
+ */
+ char s_id[32]; /* Informational name */
+ uuid_t s_uuid; /* UUID */
+ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */
+
+ /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
+ char s_sysfs_name[UUID_STRING_LEN + 1];
+
+ unsigned int s_max_links;
+ unsigned int s_d_flags; /* default d_flags for dentries */
+
+ /*
+ * The next field is for VFS *only*. No filesystems have any business
+ * even looking at it. You had been warned.
+ */
+ struct mutex s_vfs_rename_mutex; /* Kludge */
+
+ /*
+ * Filesystem subtype. If non-empty the filesystem type field
+ * in /proc/mounts will be "type.subtype"
+ */
+ const char *s_subtype;
+
+ const struct dentry_operations *__s_d_op; /* default d_op for dentries */
+
+ struct shrinker *s_shrink; /* per-sb shrinker handle */
+
+ /* Number of inodes with nlink == 0 but still referenced */
+ atomic_long_t s_remove_count;
+
+ /* Read-only state of the superblock is being changed */
+ int s_readonly_remount;
+
+ /* per-sb errseq_t for reporting writeback errors via syncfs */
+ errseq_t s_wb_err;
+
+ /* AIO completions deferred from interrupt context */
+ struct workqueue_struct *s_dio_done_wq;
+ struct hlist_head s_pins;
+
+ /*
+ * Owning user namespace and default context in which to
+ * interpret filesystem uids, gids, quotas, device nodes,
+ * xattrs and security labels.
+ */
+ struct user_namespace *s_user_ns;
+
+ /*
+ * The list_lru structure is essentially just a pointer to a table
+ * of per-node lru lists, each of which has its own spinlock.
+ * There is no need to put them into separate cachelines.
+ */
+ struct list_lru s_dentry_lru;
+ struct list_lru s_inode_lru;
+ struct rcu_head rcu;
+ struct work_struct destroy_work;
+
+ struct mutex s_sync_lock; /* sync serialisation lock */
+
+ /*
+ * Indicates how deep in a filesystem stack this SB is
+ */
+ int s_stack_depth;
+
+ /* s_inode_list_lock protects s_inodes */
+ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
+ struct list_head s_inodes; /* all inodes */
+
+ spinlock_t s_inode_wblist_lock;
+ struct list_head s_inodes_wb; /* writeback inodes */
+ long s_min_writeback_pages;
+} __randomize_layout;
+
+/*
+ * sb->s_flags. Note that these mirror the equivalent MS_* flags where
+ * represented in both.
+ */
+#define SB_RDONLY BIT(0) /* Mount read-only */
+#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */
+#define SB_NODEV BIT(2) /* Disallow access to device special files */
+#define SB_NOEXEC BIT(3) /* Disallow program execution */
+#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */
+#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */
+#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */
+#define SB_NOATIME BIT(10) /* Do not update access times. */
+#define SB_NODIRATIME BIT(11) /* Do not update directory access times */
+#define SB_SILENT BIT(15)
+#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */
+#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */
+#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */
+#define SB_I_VERSION BIT(23) /* Update inode I_version field */
+#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */
+
+/* These sb flags are internal to the kernel */
+#define SB_DEAD BIT(21)
+#define SB_DYING BIT(24)
+#define SB_FORCE BIT(27)
+#define SB_NOSEC BIT(28)
+#define SB_BORN BIT(29)
+#define SB_ACTIVE BIT(30)
+#define SB_NOUSER BIT(31)
+
+/* These flags relate to encoding and casefolding */
+#define SB_ENC_STRICT_MODE_FL (1 << 0)
+#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1)
+
+#define sb_has_strict_encoding(sb) \
+ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
+
+#if IS_ENABLED(CONFIG_UNICODE)
+#define sb_no_casefold_compat_fallback(sb) \
+ (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
+#else
+#define sb_no_casefold_compat_fallback(sb) (1)
+#endif
+
+/* sb->s_iflags */
+#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
+#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
+#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
+#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
+
+/* sb->s_iflags to limit user namespace mounts */
+#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
+#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020
+#define SB_I_UNTRUSTED_MOUNTER 0x00000040
+#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080
+
+#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */
+#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */
+#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
+#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
+#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
+#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
+#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */
+
+#endif /* _LINUX_FS_SUPER_TYPES_H */
diff --git a/include/linux/fs_types.h b/include/linux/fs_dirent.h
index 54816791196f..92f75c5bac19 100644
--- a/include/linux/fs_types.h
+++ b/include/linux/fs_dirent.h
@@ -1,6 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_FS_TYPES_H
-#define _LINUX_FS_TYPES_H
+#ifndef _LINUX_FS_DIRENT_H
+#define _LINUX_FS_DIRENT_H
+
+#include <linux/stat.h>
+#include <linux/types.h>
/*
* This is a header for the common implementation of dirent
@@ -66,10 +69,10 @@
/*
* declarations for helper functions, accompanying implementation
- * is in fs/fs_types.c
+ * is in fs/fs_dirent.c
*/
extern unsigned char fs_ftype_to_dtype(unsigned int filetype);
extern unsigned char fs_umode_to_ftype(umode_t mode);
extern unsigned char fs_umode_to_dtype(umode_t mode);
-#endif
+#endif /* _LINUX_FS_DIRENT_H */
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index baf200ab5c77..0070764b790a 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_FS_STRUCT_H
#define _LINUX_FS_STRUCT_H
+#include <linux/sched.h>
#include <linux/path.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
@@ -41,4 +42,9 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
extern bool current_chrooted(void);
+static inline int current_umask(void)
+{
+ return current->fs->umask;
+}
+
#endif /* _LINUX_FS_STRUCT_H */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7ded7df6e9b5..07f8c309e432 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -193,6 +193,10 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs
#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) || \
defined(CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS)
+#ifndef arch_ftrace_partial_regs
+#define arch_ftrace_partial_regs(regs) do {} while (0)
+#endif
+
static __always_inline struct pt_regs *
ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs)
{
@@ -202,7 +206,11 @@ ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs)
* Since arch_ftrace_get_regs() will check some members and may return
* NULL, we can not use it.
*/
- return &arch_ftrace_regs(fregs)->regs;
+ regs = &arch_ftrace_regs(fregs)->regs;
+
+ /* Allow arch specific updates to regs. */
+ arch_ftrace_partial_regs(regs);
+ return regs;
}
#endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0ceb4e09306c..623bee335383 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -7,6 +7,7 @@
#include <linux/mmzone.h>
#include <linux/topology.h>
#include <linux/alloc_tag.h>
+#include <linux/cleanup.h>
#include <linux/sched.h>
struct vm_area_struct;
@@ -463,4 +464,6 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
/* This should be paired with folio_put() rather than free_contig_range(). */
#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))
+DEFINE_FREE(free_page, void *, free_page((unsigned long)_T))
+
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 105cc4c00cc3..abc20f9810fd 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -249,10 +249,12 @@ static inline void clear_highpage_kasan_tagged(struct page *page)
kunmap_local(kaddr);
}
-#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
+#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES
-static inline void tag_clear_highpage(struct page *page)
+/* Return false to let people know we did not initialize the pages */
+static inline bool tag_clear_highpages(struct page *page, int numpages)
{
+ return false;
}
#endif
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f327d62fc985..11cab07f322a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-void huge_pmd_set_accessed(struct vm_fault *vmf);
+bool huge_pmd_set_accessed(struct vm_fault *vmf);
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
struct vm_area_struct *vma);
@@ -376,45 +376,30 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
struct list_head *list);
/*
- * try_folio_split - try to split a @folio at @page using non uniform split.
+ * try_folio_split_to_order - try to split a @folio at @page to @new_order using
+ * non uniform split.
* @folio: folio to be split
- * @page: split to order-0 at the given page
- * @list: store the after-split folios
+ * @page: split to @new_order at the given page
+ * @new_order: the target split order
*
- * Try to split a @folio at @page using non uniform split to order-0, if
- * non uniform split is not supported, fall back to uniform split.
+ * Try to split a @folio at @page using non uniform split to @new_order, if
+ * non uniform split is not supported, fall back to uniform split. After-split
+ * folios are put back to LRU list. Use min_order_for_split() to get the lower
+ * bound of @new_order.
*
* Return: 0: split is successful, otherwise split failed.
*/
-static inline int try_folio_split(struct folio *folio, struct page *page,
- struct list_head *list)
+static inline int try_folio_split_to_order(struct folio *folio,
+ struct page *page, unsigned int new_order)
{
- int ret = min_order_for_split(folio);
-
- if (ret < 0)
- return ret;
-
- if (!non_uniform_split_supported(folio, 0, false))
- return split_huge_page_to_list_to_order(&folio->page, list,
- ret);
- return folio_split(folio, ret, page, list);
+ if (!non_uniform_split_supported(folio, new_order, /* warns= */ false))
+ return split_huge_page_to_list_to_order(&folio->page, NULL,
+ new_order);
+ return folio_split(folio, new_order, page, NULL);
}
static inline int split_huge_page(struct page *page)
{
- struct folio *folio = page_folio(page);
- int ret = min_order_for_split(folio);
-
- if (ret < 0)
- return ret;
-
- /*
- * split_huge_page() locks the page before splitting and
- * expects the same page that has been split to be locked when
- * returned. split_folio(page_folio(page)) cannot be used here
- * because it converts the page to folio and passes the head
- * page to be split.
- */
- return split_huge_page_to_list_to_order(page, NULL, ret);
+ return split_huge_page_to_list_to_order(page, NULL, 0);
}
void deferred_split_folio(struct folio *folio, bool partially_mapped);
@@ -597,14 +582,20 @@ static inline int split_huge_page(struct page *page)
return -EINVAL;
}
+static inline int min_order_for_split(struct folio *folio)
+{
+ VM_WARN_ON_ONCE_FOLIO(1, folio);
+ return -EINVAL;
+}
+
static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
{
VM_WARN_ON_ONCE_FOLIO(1, folio);
return -EINVAL;
}
-static inline int try_folio_split(struct folio *folio, struct page *page,
- struct list_head *list)
+static inline int try_folio_split_to_order(struct folio *folio,
+ struct page *page, unsigned int new_order)
{
VM_WARN_ON_ONCE_FOLIO(1, folio);
return -EINVAL;
diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h
index 5eb66a399002..4f33e6a39797 100644
--- a/include/linux/iio/buffer-dma.h
+++ b/include/linux/iio/buffer-dma.h
@@ -174,5 +174,6 @@ int iio_dma_buffer_enqueue_dmabuf(struct iio_buffer *buffer,
size_t size, bool cyclic);
void iio_dma_buffer_lock_queue(struct iio_buffer *buffer);
void iio_dma_buffer_unlock_queue(struct iio_buffer *buffer);
+struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer);
#endif
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index e72552e026f3..8d770ced66b2 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -50,6 +50,7 @@ struct sg_table;
* @enqueue_dmabuf: called from userspace via ioctl to queue this DMABUF
* object to this buffer. Requires a valid DMABUF fd, that
* was previouly attached to this buffer.
+ * @get_dma_dev: called to get the DMA channel associated with this buffer.
* @lock_queue: called when the core needs to lock the buffer queue;
* it is used when enqueueing DMABUF objects.
* @unlock_queue: used to unlock a previously locked buffer queue
@@ -90,6 +91,7 @@ struct iio_buffer_access_funcs {
struct iio_dma_buffer_block *block,
struct dma_fence *fence, struct sg_table *sgt,
size_t size, bool cyclic);
+ struct device * (*get_dma_dev)(struct iio_buffer *buffer);
void (*lock_queue)(struct iio_buffer *buffer);
void (*unlock_queue)(struct iio_buffer *buffer);
diff --git a/include/linux/init.h b/include/linux/init.h
index 17c1bc712e23..40331923b9f4 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -200,12 +200,13 @@ extern struct module __this_module;
/* Format: <modname>__<counter>_<line>_<fn> */
#define __initcall_id(fn) \
+ __PASTE(kmod_, \
__PASTE(__KBUILD_MODNAME, \
__PASTE(__, \
__PASTE(__COUNTER__, \
__PASTE(_, \
__PASTE(__LINE__, \
- __PASTE(_, fn))))))
+ __PASTE(_, fn)))))))
/* Format: __<prefix>__<iid><id> */
#define __initcall_name(prefix, __iid, id) \
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bccb3f1f6262..a6cb241ea00c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,7 +25,6 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
extern struct nsproxy init_nsproxy;
-extern struct cred init_cred;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#define INIT_PREV_CPUTIME(x) .prev_cputime = { \
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index c0397423d3a8..e9ade2ff4af6 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -152,7 +152,7 @@ struct rapl_if_priv {
union rapl_reg reg_unit;
union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
int limits[RAPL_DOMAIN_MAX];
- int (*read_raw)(int id, struct reg_action *ra);
+ int (*read_raw)(int id, struct reg_action *ra, bool atomic);
int (*write_raw)(int id, struct reg_action *ra);
void *defaults;
void *rpi;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 51b6484c0493..266f2b39213a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -109,6 +109,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
* @name: name of the device
* @dev_id: cookie to identify the device
* @percpu_dev_id: cookie to identify the device
+ * @affinity: CPUs this irqaction is allowed to run on
* @next: pointer to the next irqaction for shared interrupts
* @irq: interrupt number
* @flags: flags (see IRQF_* above)
@@ -121,8 +122,11 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
*/
struct irqaction {
irq_handler_t handler;
- void *dev_id;
- void __percpu *percpu_dev_id;
+ union {
+ void *dev_id;
+ void __percpu *percpu_dev_id;
+ };
+ const struct cpumask *affinity;
struct irqaction *next;
irq_handler_t thread_fn;
struct task_struct *thread;
@@ -179,7 +183,7 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler,
extern int __must_check
__request_percpu_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *devname,
- void __percpu *percpu_dev_id);
+ const cpumask_t *affinity, void __percpu *percpu_dev_id);
extern int __must_check
request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags,
@@ -190,12 +194,21 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler,
const char *devname, void __percpu *percpu_dev_id)
{
return __request_percpu_irq(irq, handler, 0,
- devname, percpu_dev_id);
+ devname, NULL, percpu_dev_id);
+}
+
+static inline int __must_check
+request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler,
+ const char *devname, const cpumask_t *affinity,
+ void __percpu *percpu_dev_id)
+{
+ return __request_percpu_irq(irq, handler, 0,
+ devname, affinity, percpu_dev_id);
}
extern int __must_check
-request_percpu_nmi(unsigned int irq, irq_handler_t handler,
- const char *devname, void __percpu *dev);
+request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name,
+ const struct cpumask *affinity, void __percpu *dev_id);
extern const void *free_irq(unsigned int, void *);
extern void free_percpu_irq(unsigned int, void __percpu *);
diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h
index 2b8026a39906..9d5791e9f737 100644
--- a/include/linux/interval_tree.h
+++ b/include/linux/interval_tree.h
@@ -20,6 +20,10 @@ interval_tree_remove(struct interval_tree_node *node,
struct rb_root_cached *root);
extern struct interval_tree_node *
+interval_tree_subtree_search(struct interval_tree_node *node,
+ unsigned long start, unsigned long last);
+
+extern struct interval_tree_node *
interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start, unsigned long last);
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
index 1b400f26f63d..c5a2fed49eb0 100644
--- a/include/linux/interval_tree_generic.h
+++ b/include/linux/interval_tree_generic.h
@@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \
* Cond2: start <= ITLAST(node) \
*/ \
\
-static ITSTRUCT * \
+ITSTATIC ITSTRUCT * \
ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \
{ \
while (true) { \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 73dceabc21c8..520e967cb501 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/mm_types.h>
#include <linux/blkdev.h>
+#include <linux/pagevec.h>
struct address_space;
struct fiemap_extent_info;
@@ -16,6 +17,7 @@ struct inode;
struct iomap_iter;
struct iomap_dio;
struct iomap_writepage_ctx;
+struct iomap_read_folio_ctx;
struct iov_iter;
struct kiocb;
struct page;
@@ -241,11 +243,12 @@ struct iomap_iter {
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
+ struct folio_batch *fbatch;
void *private;
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
-int iomap_iter_advance(struct iomap_iter *iter, u64 *count);
+int iomap_iter_advance(struct iomap_iter *iter, u64 count);
/**
* iomap_length_trim - trimmed length of the current iomap iteration
@@ -282,9 +285,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter)
*/
static inline int iomap_iter_advance_full(struct iomap_iter *iter)
{
- u64 length = iomap_length(iter);
-
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, iomap_length(iter));
}
/**
@@ -339,8 +340,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops, void *private);
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
-void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
+void iomap_read_folio(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx);
+void iomap_readahead(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
@@ -349,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops);
+loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset,
+ loff_t length);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops, void *private);
@@ -430,6 +435,10 @@ struct iomap_writeback_ops {
* An existing mapping from a previous call to this method can be reused
* by the file system if it is still valid.
*
+ * If this succeeds, iomap_finish_folio_write() must be called once
+ * writeback completes for the range, regardless of whether the
+ * writeback succeeded or failed.
+ *
* Returns the number of bytes processed or a negative errno.
*/
ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc,
@@ -467,14 +476,41 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
loff_t pos, loff_t end_pos, unsigned int dirty_len);
int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error);
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
- size_t len);
+void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
+ int error);
void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len);
int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio);
int iomap_writepages(struct iomap_writepage_ctx *wpc);
+struct iomap_read_folio_ctx {
+ const struct iomap_read_ops *ops;
+ struct folio *cur_folio;
+ struct readahead_control *rac;
+ void *read_ctx;
+};
+
+struct iomap_read_ops {
+ /*
+ * Read in a folio range.
+ *
+ * If this succeeds, iomap_finish_folio_read() must be called after the
+ * range is read in, regardless of whether the read succeeded or failed.
+ *
+ * Returns 0 on success or a negative error on failure.
+ */
+ int (*read_folio_range)(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t len);
+
+ /*
+ * Submit any pending read requests.
+ *
+ * This is optional.
+ */
+ void (*submit_read)(struct iomap_read_folio_ctx *ctx);
+};
+
/*
* Flags for direct I/O ->end_io:
*/
@@ -518,6 +554,14 @@ struct iomap_dio_ops {
*/
#define IOMAP_DIO_PARTIAL (1 << 2)
+/*
+ * Ensure each bio is aligned to fs block size.
+ *
+ * For filesystems which need to calculate/verify the checksum of each fs
+ * block. Otherwise they may not be able to handle unaligned bios.
+ */
+#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3)
+
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before);
@@ -540,4 +584,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
extern struct bio_set iomap_ioend_bioset;
+#ifdef CONFIG_BLOCK
+extern const struct iomap_read_ops iomap_bio_read_ops;
+
+static inline void iomap_bio_read_folio(struct folio *folio,
+ const struct iomap_ops *ops)
+{
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &iomap_bio_read_ops,
+ .cur_folio = folio,
+ };
+
+ iomap_read_folio(ops, &ctx);
+}
+
+static inline void iomap_bio_readahead(struct readahead_control *rac,
+ const struct iomap_ops *ops)
+{
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &iomap_bio_read_ops,
+ .rac = rac,
+ };
+
+ iomap_readahead(ops, &ctx);
+}
+#endif /* CONFIG_BLOCK */
+
#endif /* LINUX_IOMAP_H */
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index d643c7c87822..6ab913e57da0 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -2,11 +2,12 @@
#ifndef __LINUX_IRQENTRYCOMMON_H
#define __LINUX_IRQENTRYCOMMON_H
+#include <linux/context_tracking.h>
+#include <linux/kmsan.h>
+#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
-#include <linux/context_tracking.h>
#include <linux/tick.h>
-#include <linux/kmsan.h>
#include <linux/unwind_deferred.h>
#include <asm/entry-common.h>
@@ -29,7 +30,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
- _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
@@ -67,6 +68,7 @@ static __always_inline bool arch_in_rcu_eqs(void) { return false; }
/**
* enter_from_user_mode - Establish state when coming from user mode
+ * @regs: Pointer to currents pt_regs
*
* Syscall/interrupt entry disables interrupts, but user mode is traced as
* interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
@@ -195,14 +197,11 @@ static __always_inline void arch_exit_to_user_mode(void) { }
*/
void arch_do_signal_or_restart(struct pt_regs *regs);
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- */
-unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work);
+/* Handle pending TIF work */
+unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work);
/**
- * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
* @regs: Pointer to pt_regs on entry stack
*
* 1) check that interrupts are disabled
@@ -210,8 +209,10 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* 3) call exit_to_user_mode_loop() if any flags from
* EXIT_TO_USER_MODE_WORK are set
* 4) check that interrupts are still disabled
+ *
+ * Don't invoke directly, use the syscall/irqentry_ prefixed variants below
*/
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs)
{
unsigned long ti_work;
@@ -225,13 +226,52 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work);
+}
+static __always_inline void __exit_to_user_mode_validate(void)
+{
/* Ensure that kernel state is sane for a return to userspace */
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
}
+/* Temporary workaround to keep ARM64 alive */
+static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs)
+{
+ __exit_to_user_mode_prepare(regs);
+ rseq_exit_to_user_mode_legacy();
+ __exit_to_user_mode_validate();
+}
+
+/**
+ * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs: Pointer to pt_regs on entry stack
+ *
+ * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for
+ * syscalls and interrupts.
+ */
+static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ __exit_to_user_mode_prepare(regs);
+ rseq_syscall_exit_to_user_mode();
+ __exit_to_user_mode_validate();
+}
+
+/**
+ * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs: Pointer to pt_regs on entry stack
+ *
+ * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for
+ * syscalls and interrupts.
+ */
+static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ __exit_to_user_mode_prepare(regs);
+ rseq_irqentry_exit_to_user_mode();
+ __exit_to_user_mode_validate();
+}
+
/**
* exit_to_user_mode - Fixup state when exiting to user mode
*
@@ -253,11 +293,11 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
static __always_inline void exit_to_user_mode(void)
{
instrumentation_begin();
+ unwind_reset_info();
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
instrumentation_end();
- unwind_reset_info();
user_enter_irqoff();
arch_exit_to_user_mode();
lockdep_hardirqs_on(CALLER_ADDR0);
@@ -274,7 +314,11 @@ static __always_inline void exit_to_user_mode(void)
*
* The function establishes state (lockdep, RCU (context tracking), tracing)
*/
-void irqentry_enter_from_user_mode(struct pt_regs *regs);
+static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+ rseq_note_user_irq_entry();
+}
/**
* irqentry_exit_to_user_mode - Interrupt exit work
@@ -289,7 +333,13 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs);
* Interrupt exit is not invoking #1 which is the syscall specific one time
* work.
*/
-void irqentry_exit_to_user_mode(struct pt_regs *regs);
+static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ irqentry_exit_to_user_mode_prepare(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
#ifndef irqentry_state
/**
@@ -354,6 +404,7 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
* Conditional reschedule with additional sanity checks.
*/
void raw_irqentry_exit_cond_resched(void);
+
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c67e76fbcc07..4a9f1d7b08c3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -655,7 +655,6 @@ extern void handle_bad_irq(struct irq_desc *desc);
extern void handle_nested_irq(unsigned int irq);
extern void handle_fasteoi_nmi(struct irq_desc *desc);
-extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc);
extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg);
extern int irq_chip_pm_get(struct irq_data *data);
@@ -719,10 +718,6 @@ static inline void irq_set_chip_and_handler(unsigned int irq,
}
extern int irq_set_percpu_devid(unsigned int irq);
-extern int irq_set_percpu_devid_partition(unsigned int irq,
- const struct cpumask *affinity);
-extern int irq_get_percpu_devid_partition(unsigned int irq,
- struct cpumask *affinity);
extern void
__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 136f2980cba3..c5afd053ae32 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -2,8 +2,9 @@
#ifndef _LINUX_IRQ_WORK_H
#define _LINUX_IRQ_WORK_H
-#include <linux/smp_types.h>
+#include <linux/irq_work_types.h>
#include <linux/rcuwait.h>
+#include <linux/smp_types.h>
/*
* An entry can be in one of four states:
@@ -14,12 +15,6 @@
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
*/
-struct irq_work {
- struct __call_single_node node;
- void (*func)(struct irq_work *);
- struct rcuwait irqwait;
-};
-
#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \
.node = { .u_flags = (_flags), }, \
.func = (_func), \
diff --git a/include/linux/irq_work_types.h b/include/linux/irq_work_types.h
new file mode 100644
index 000000000000..73abec5bb06e
--- /dev/null
+++ b/include/linux/irq_work_types.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IRQ_WORK_TYPES_H
+#define _LINUX_IRQ_WORK_TYPES_H
+
+#include <linux/smp_types.h>
+#include <linux/types.h>
+
+struct irq_work {
+ struct __call_single_node node;
+ void (*func)(struct irq_work *);
+ struct rcuwait irqwait;
+};
+
+#endif
diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index d5e6024cb2a8..bc4ddacd6ddc 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -17,12 +17,18 @@
#include <linux/of_irq.h>
#include <linux/platform_device.h>
+typedef int (*platform_irq_probe_t)(struct platform_device *, struct device_node *);
+
/* Undefined on purpose */
extern of_irq_init_cb_t typecheck_irq_init_cb;
+extern platform_irq_probe_t typecheck_irq_probe;
#define typecheck_irq_init_cb(fn) \
(__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn)
+#define typecheck_irq_probe(fn) \
+ (__typecheck(typecheck_irq_probe, &fn) ? fn : fn)
+
/*
* This macro must be used by the different irqchip drivers to declare
* the association between their DT compatible string and their
@@ -42,7 +48,7 @@ extern int platform_irqchip_probe(struct platform_device *pdev);
static const struct of_device_id drv_name##_irqchip_match_table[] = {
#define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \
- .data = typecheck_irq_init_cb(fn), },
+ .data = typecheck_irq_probe(fn), },
#define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) \
diff --git a/include/linux/irqchip/irq-partition-percpu.h b/include/linux/irqchip/irq-partition-percpu.h
deleted file mode 100644
index b35ee22c278f..000000000000
--- a/include/linux/irqchip/irq-partition-percpu.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2016 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#ifndef __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H
-#define __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H
-
-#include <linux/fwnode.h>
-#include <linux/cpumask_types.h>
-#include <linux/irqdomain.h>
-
-struct partition_affinity {
- cpumask_t mask;
- void *partition_id;
-};
-
-struct partition_desc;
-
-#ifdef CONFIG_PARTITION_PERCPU
-int partition_translate_id(struct partition_desc *desc, void *partition_id);
-struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode,
- struct partition_affinity *parts,
- int nr_parts,
- int chained_irq,
- const struct irq_domain_ops *ops);
-struct irq_domain *partition_get_domain(struct partition_desc *dsc);
-#else
-static inline int partition_translate_id(struct partition_desc *desc,
- void *partition_id)
-{
- return -EINVAL;
-}
-
-static inline
-struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode,
- struct partition_affinity *parts,
- int nr_parts,
- int chained_irq,
- const struct irq_domain_ops *ops)
-{
- return NULL;
-}
-
-static inline
-struct irq_domain *partition_get_domain(struct partition_desc *dsc)
-{
- return NULL;
-}
-#endif
-
-#endif /* __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H */
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index fd091c35d572..37e0b5b5600a 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -82,7 +82,6 @@ struct irq_desc {
int threads_handled_last;
raw_spinlock_t lock;
struct cpumask *percpu_enabled;
- const struct cpumask *percpu_affinity;
#ifdef CONFIG_SMP
const struct cpumask *affinity_hint;
struct irq_affinity_notify *affinity_notify;
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 4a86e6b915dd..952d3c8dd6b7 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -44,6 +44,23 @@ struct irq_fwspec {
u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS];
};
+/**
+ * struct irq_fwspec_info - firmware provided IRQ information structure
+ *
+ * @flags: Information validity flags
+ * @affinity: Affinity mask for this interrupt
+ *
+ * This structure reports firmware-specific information about an
+ * interrupt. The only significant information is the affinity of a
+ * per-CPU interrupt, but this is designed to be extended as required.
+ */
+struct irq_fwspec_info {
+ unsigned long flags;
+ const struct cpumask *affinity;
+};
+
+#define IRQ_FWSPEC_INFO_AFFINITY_VALID BIT(0)
+
/* Conversion function from of_phandle_args fields to fwspec */
void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
unsigned int count, struct irq_fwspec *fwspec);
@@ -69,6 +86,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
* @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and
* linux irq type value (@out_type). This is a generalised @xlate
* (over struct irq_fwspec) and is preferred if provided.
+ * @get_fwspec_info:
+ * Given @fwspec, report additional firmware-provided information in
+ * @info. Optional.
* @debug_show: For domains to show specific data for an interrupt in debugfs.
*
* Functions below are provided by the driver and called whenever a new mapping
@@ -96,6 +116,7 @@ struct irq_domain_ops {
void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data);
int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec,
unsigned long *out_hwirq, unsigned int *out_type);
+ int (*get_fwspec_info)(struct irq_fwspec *fwspec, struct irq_fwspec_info *info);
#endif
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
void (*debug_show)(struct seq_file *m, struct irq_domain *d,
@@ -602,6 +623,8 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_bas
int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq);
+int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info);
+
static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
{
return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY;
@@ -685,6 +708,10 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain)
return false;
}
+static inline int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#ifdef CONFIG_GENERIC_MSI_IRQ
@@ -703,12 +730,6 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig
}
#endif
-/* Deprecated functions. Will be removed in the merge window */
-static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
-{
- return node ? &node->fwnode : NULL;
-}
-
static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
const struct irq_domain_ops *ops,
void *host_data)
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 490464c205b4..a568d8e6f4e8 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -11,8 +11,22 @@
#ifdef KVM_SUB_MODULES
#define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) \
EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES))
+#define EXPORT_SYMBOL_FOR_KVM(symbol) \
+ EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm," __stringify(KVM_SUB_MODULES))
#else
#define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol)
+/*
+ * Allow architectures to provide a custom EXPORT_SYMBOL_FOR_KVM, but only
+ * if there are no submodules, e.g. to allow suppressing exports if KVM=m, but
+ * kvm.ko won't actually be built (due to lack of at least one submodule).
+ */
+#ifndef EXPORT_SYMBOL_FOR_KVM
+#if IS_MODULE(CONFIG_KVM)
+#define EXPORT_SYMBOL_FOR_KVM(symbol) EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm")
+#else
+#define EXPORT_SYMBOL_FOR_KVM(symbol)
+#endif /* IS_MODULE(CONFIG_KVM) */
+#endif /* EXPORT_SYMBOL_FOR_KVM */
#endif
#ifndef __ASSEMBLER__
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 51a258c24ff5..772919e8096a 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -13,6 +13,7 @@
#include <linux/ftrace.h>
#include <linux/completion.h>
#include <linux/list.h>
+#include <linux/livepatch_external.h>
#include <linux/livepatch_sched.h>
#if IS_ENABLED(CONFIG_LIVEPATCH)
@@ -77,30 +78,6 @@ struct klp_func {
bool transition;
};
-struct klp_object;
-
-/**
- * struct klp_callbacks - pre/post live-(un)patch callback structure
- * @pre_patch: executed before code patching
- * @post_patch: executed after code patching
- * @pre_unpatch: executed before code unpatching
- * @post_unpatch: executed after code unpatching
- * @post_unpatch_enabled: flag indicating if post-unpatch callback
- * should run
- *
- * All callbacks are optional. Only the pre-patch callback, if provided,
- * will be unconditionally executed. If the parent klp_object fails to
- * patch for any reason, including a non-zero error status returned from
- * the pre-patch callback, no further callbacks will be executed.
- */
-struct klp_callbacks {
- int (*pre_patch)(struct klp_object *obj);
- void (*post_patch)(struct klp_object *obj);
- void (*pre_unpatch)(struct klp_object *obj);
- void (*post_unpatch)(struct klp_object *obj);
- bool post_unpatch_enabled;
-};
-
/**
* struct klp_object - kernel object structure for live patching
* @name: module name (or NULL for vmlinux)
diff --git a/include/linux/livepatch_external.h b/include/linux/livepatch_external.h
new file mode 100644
index 000000000000..138af19b0f5c
--- /dev/null
+++ b/include/linux/livepatch_external.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * External livepatch interfaces for patch creation tooling
+ */
+
+#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_
+#define _LINUX_LIVEPATCH_EXTERNAL_H_
+
+#include <linux/types.h>
+
+#define KLP_RELOC_SEC_PREFIX ".klp.rela."
+#define KLP_SYM_PREFIX ".klp.sym."
+
+#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_
+#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_
+#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_
+#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_
+
+#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX)
+#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX)
+#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX)
+#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX)
+
+struct klp_object;
+
+typedef int (*klp_pre_patch_t)(struct klp_object *obj);
+typedef void (*klp_post_patch_t)(struct klp_object *obj);
+typedef void (*klp_pre_unpatch_t)(struct klp_object *obj);
+typedef void (*klp_post_unpatch_t)(struct klp_object *obj);
+
+/**
+ * struct klp_callbacks - pre/post live-(un)patch callback structure
+ * @pre_patch: executed before code patching
+ * @post_patch: executed after code patching
+ * @pre_unpatch: executed before code unpatching
+ * @post_unpatch: executed after code unpatching
+ * @post_unpatch_enabled: flag indicating if post-unpatch callback
+ * should run
+ *
+ * All callbacks are optional. Only the pre-patch callback, if provided,
+ * will be unconditionally executed. If the parent klp_object fails to
+ * patch for any reason, including a non-zero error status returned from
+ * the pre-patch callback, no further callbacks will be executed.
+ */
+struct klp_callbacks {
+ klp_pre_patch_t pre_patch;
+ klp_post_patch_t post_patch;
+ klp_pre_unpatch_t pre_unpatch;
+ klp_post_unpatch_t post_unpatch;
+ bool post_unpatch_enabled;
+};
+
+/*
+ * 'struct klp_{func,object}_ext' are compact "external" representations of
+ * 'struct klp_{func,object}'. They are used by objtool for livepatch
+ * generation. The structs are then read by the livepatch module and converted
+ * to the real structs before calling klp_enable_patch().
+ *
+ * TODO make these the official API for klp_enable_patch(). That should
+ * simplify livepatch's interface as well as its data structure lifetime
+ * management.
+ */
+struct klp_func_ext {
+ const char *old_name;
+ void *new_func;
+ unsigned long sympos;
+};
+
+struct klp_object_ext {
+ const char *name;
+ struct klp_func_ext *funcs;
+ struct klp_callbacks callbacks;
+ unsigned int nr_funcs;
+};
+
+#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */
diff --git a/include/linux/livepatch_helpers.h b/include/linux/livepatch_helpers.h
new file mode 100644
index 000000000000..99d68d0773fa
--- /dev/null
+++ b/include/linux/livepatch_helpers.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIVEPATCH_HELPERS_H
+#define _LINUX_LIVEPATCH_HELPERS_H
+
+/*
+ * Interfaces for use by livepatch patches
+ */
+
+#include <linux/syscalls.h>
+#include <linux/livepatch.h>
+
+#ifdef MODULE
+#define KLP_OBJNAME __KBUILD_MODNAME
+#else
+#define KLP_OBJNAME vmlinux
+#endif
+
+/* Livepatch callback registration */
+
+#define KLP_CALLBACK_PTRS ".discard.klp_callback_ptrs"
+
+#define KLP_PRE_PATCH_CALLBACK(func) \
+ klp_pre_patch_t __used __section(KLP_CALLBACK_PTRS) \
+ __PASTE(__KLP_PRE_PATCH_PREFIX, KLP_OBJNAME) = func
+
+#define KLP_POST_PATCH_CALLBACK(func) \
+ klp_post_patch_t __used __section(KLP_CALLBACK_PTRS) \
+ __PASTE(__KLP_POST_PATCH_PREFIX, KLP_OBJNAME) = func
+
+#define KLP_PRE_UNPATCH_CALLBACK(func) \
+ klp_pre_unpatch_t __used __section(KLP_CALLBACK_PTRS) \
+ __PASTE(__KLP_PRE_UNPATCH_PREFIX, KLP_OBJNAME) = func
+
+#define KLP_POST_UNPATCH_CALLBACK(func) \
+ klp_post_unpatch_t __used __section(KLP_CALLBACK_PTRS) \
+ __PASTE(__KLP_POST_UNPATCH_PREFIX, KLP_OBJNAME) = func
+
+/*
+ * Replace static_call() usage with this macro when create-diff-object
+ * recommends it due to the original static call key living in a module.
+ *
+ * This converts the static call to a regular indirect call.
+ */
+#define KLP_STATIC_CALL(name) \
+ ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
+
+/* Syscall patching */
+
+#define KLP_SYSCALL_DEFINE1(name, ...) KLP_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE2(name, ...) KLP_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE3(name, ...) KLP_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE4(name, ...) KLP_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE5(name, ...) KLP_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
+#define KLP_SYSCALL_DEFINE6(name, ...) KLP_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
+
+#define KLP_SYSCALL_DEFINEx(x, sname, ...) \
+ __KLP_SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+
+#ifdef CONFIG_X86_64
+// TODO move this to arch/x86/include/asm/syscall_wrapper.h and share code
+#define __KLP_SYSCALL_DEFINEx(x, name, ...) \
+ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
+ static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
+ __X64_SYS_STUBx(x, name, __VA_ARGS__) \
+ __IA32_SYS_STUBx(x, name, __VA_ARGS__) \
+ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
+ { \
+ long ret = __klp_do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
+ __MAP(x,__SC_TEST,__VA_ARGS__); \
+ __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
+ return ret; \
+ } \
+ static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+
+#endif
+
+#endif /* _LINUX_LIVEPATCH_HELPERS_H */
diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index 0d91d060e3e9..b0e6ab329b00 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -6,6 +6,7 @@
/**
* local_lock_init - Runtime initialize a lock instance
+ * @lock: The lock variable
*/
#define local_lock_init(lock) __local_lock_init(lock)
@@ -52,7 +53,8 @@
__local_unlock_irqrestore(this_cpu_ptr(lock), flags)
/**
- * local_lock_init - Runtime initialize a lock instance
+ * local_trylock_init - Runtime initialize a lock instance
+ * @lock: The lock variable
*/
#define local_trylock_init(lock) __local_trylock_init(lock)
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index a4dc479157b5..8f82b4eb542f 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -99,18 +99,18 @@ do { \
#define __local_lock_acquire(lock) \
do { \
- local_trylock_t *tl; \
- local_lock_t *l; \
+ local_trylock_t *__tl; \
+ local_lock_t *__l; \
\
- l = (local_lock_t *)(lock); \
- tl = (local_trylock_t *)l; \
+ __l = (local_lock_t *)(lock); \
+ __tl = (local_trylock_t *)__l; \
_Generic((lock), \
local_trylock_t *: ({ \
- lockdep_assert(tl->acquired == 0); \
- WRITE_ONCE(tl->acquired, 1); \
+ lockdep_assert(__tl->acquired == 0); \
+ WRITE_ONCE(__tl->acquired, 1); \
}), \
local_lock_t *: (void)0); \
- local_lock_acquire(l); \
+ local_lock_acquire(__l); \
} while (0)
#define __local_lock(lock) \
@@ -133,36 +133,36 @@ do { \
#define __local_trylock(lock) \
({ \
- local_trylock_t *tl; \
+ local_trylock_t *__tl; \
\
preempt_disable(); \
- tl = (lock); \
- if (READ_ONCE(tl->acquired)) { \
+ __tl = (lock); \
+ if (READ_ONCE(__tl->acquired)) { \
preempt_enable(); \
- tl = NULL; \
+ __tl = NULL; \
} else { \
- WRITE_ONCE(tl->acquired, 1); \
+ WRITE_ONCE(__tl->acquired, 1); \
local_trylock_acquire( \
- (local_lock_t *)tl); \
+ (local_lock_t *)__tl); \
} \
- !!tl; \
+ !!__tl; \
})
#define __local_trylock_irqsave(lock, flags) \
({ \
- local_trylock_t *tl; \
+ local_trylock_t *__tl; \
\
local_irq_save(flags); \
- tl = (lock); \
- if (READ_ONCE(tl->acquired)) { \
+ __tl = (lock); \
+ if (READ_ONCE(__tl->acquired)) { \
local_irq_restore(flags); \
- tl = NULL; \
+ __tl = NULL; \
} else { \
- WRITE_ONCE(tl->acquired, 1); \
+ WRITE_ONCE(__tl->acquired, 1); \
local_trylock_acquire( \
- (local_lock_t *)tl); \
+ (local_lock_t *)__tl); \
} \
- !!tl; \
+ !!__tl; \
})
/* preemption or migration must be disabled before calling __local_lock_is_locked */
@@ -170,16 +170,16 @@ do { \
#define __local_lock_release(lock) \
do { \
- local_trylock_t *tl; \
- local_lock_t *l; \
+ local_trylock_t *__tl; \
+ local_lock_t *__l; \
\
- l = (local_lock_t *)(lock); \
- tl = (local_trylock_t *)l; \
- local_lock_release(l); \
+ __l = (local_lock_t *)(lock); \
+ __tl = (local_trylock_t *)__l; \
+ local_lock_release(__l); \
_Generic((lock), \
local_trylock_t *: ({ \
- lockdep_assert(tl->acquired == 1); \
- WRITE_ONCE(tl->acquired, 0); \
+ lockdep_assert(__tl->acquired == 1); \
+ WRITE_ONCE(__tl->acquired, 0); \
}), \
local_lock_t *: (void)0); \
} while (0)
@@ -223,12 +223,12 @@ typedef spinlock_t local_trylock_t;
#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
#define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
-#define __local_lock_init(l) \
+#define __local_lock_init(__l) \
do { \
- local_spin_lock_init((l)); \
+ local_spin_lock_init((__l)); \
} while (0)
-#define __local_trylock_init(l) __local_lock_init(l)
+#define __local_trylock_init(__l) __local_lock_init(__l)
#define __local_lock(__lock) \
do { \
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 67964dc4db95..dd634103b014 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -616,7 +616,7 @@ do { \
#define lockdep_assert_in_softirq() \
do { \
WARN_ON_ONCE(__lockdep_enabled && \
- (!in_softirq() || in_irq() || in_nmi())); \
+ (!in_softirq() || in_hardirq() || in_nmi())); \
} while (0)
extern void lockdep_assert_in_softirq_func(void);
diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
index 4c1a91b07de3..e1555e06e7e5 100644
--- a/include/linux/mailbox/mtk-cmdq-mailbox.h
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -77,6 +77,16 @@ struct cmdq_pkt {
size_t buf_size; /* real buffer size */
};
+/**
+ * cmdq_get_shift_pa() - get the shift bits of physical address
+ * @chan: mailbox channel
+ *
+ * GCE can only fetch the command buffer address from a 32-bit register.
+ * Some SOCs support more than 32-bit command buffer address for GCE, which
+ * requires some shift bits to make the address fit into the 32-bit register.
+ *
+ * Return: the shift bits of physical address
+ */
u8 cmdq_get_shift_pa(struct mbox_chan *chan);
#endif /* __MTK_CMDQ_MAILBOX_H__ */
diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h
index 62674c83bde4..48e2ff95332f 100644
--- a/include/linux/map_benchmark.h
+++ b/include/linux/map_benchmark.h
@@ -27,5 +27,6 @@ struct map_benchmark {
__u32 dma_dir; /* DMA data direction */
__u32 dma_trans_ns; /* time for DMA transmission in ns */
__u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
+ __u8 expansion[76]; /* For future use */
};
#endif /* _KERNEL_DMA_BENCHMARK_H */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0c214256216f..ba1515160894 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -96,17 +96,8 @@ int set_memory_block_size_order(unsigned int order);
#define MEM_GOING_ONLINE (1<<3)
#define MEM_CANCEL_ONLINE (1<<4)
#define MEM_CANCEL_OFFLINE (1<<5)
-#define MEM_PREPARE_ONLINE (1<<6)
-#define MEM_FINISH_OFFLINE (1<<7)
struct memory_notify {
- /*
- * The altmap_start_pfn and altmap_nr_pages fields are designated for
- * specifying the altmap range and are exclusively intended for use in
- * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
- */
- unsigned long altmap_start_pfn;
- unsigned long altmap_nr_pages;
unsigned long start_pfn;
unsigned long nr_pages;
};
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 23f038a16231..f2f16cdd73ee 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -58,22 +58,6 @@ typedef int __bitwise mhp_t;
* implies the node id (nid).
*/
#define MHP_NID_IS_MGID ((__force mhp_t)BIT(2))
-/*
- * The hotplugged memory is completely inaccessible while the memory is
- * offline. The memory provider will handle MEM_PREPARE_ONLINE /
- * MEM_FINISH_OFFLINE notifications and make the memory accessible.
- *
- * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY,
- * because the altmap cannot be written (e.g., poisoned) when adding
- * memory -- before it is set online.
- *
- * This allows for adding memory with an altmap that is not currently
- * made available by a hypervisor. When onlining that memory, the
- * hypervisor can be instructed to make that memory available, and
- * the onlining phase will not require any memory allocations, which is
- * helpful in low-memory situations.
- */
-#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3))
/*
* Extended parameters for memory hotplug:
@@ -123,7 +107,7 @@ extern void adjust_present_page_count(struct page *page,
long nr_pages);
/* VM interface that may be used by firmware interface */
extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone, bool mhp_off_inaccessible);
+ struct zone *zone);
extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index e5951ba12a28..30c7aecbd245 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -25,7 +25,6 @@ struct vmem_altmap {
unsigned long free;
unsigned long align;
unsigned long alloc;
- bool inaccessible;
};
/*
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 7ef2c7c7d803..9d47cdc727ad 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -183,6 +183,7 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq)
complete(&cq->free);
}
+void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe);
int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
u32 *in, int inlen, u32 *out, int outlen);
int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d16b33bacc32..8dc0a07570cc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2074,7 +2074,7 @@ static inline unsigned long folio_nr_pages(const struct folio *folio)
return folio_large_nr_pages(folio);
}
-#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE)
+#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS)
/*
* We don't expect any folios that exceed buddy sizes (and consequently
* memory sections).
@@ -2087,10 +2087,17 @@ static inline unsigned long folio_nr_pages(const struct folio *folio)
* pages are guaranteed to be contiguous.
*/
#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT
-#else
+#elif defined(CONFIG_HUGETLB_PAGE)
/*
* There is no real limit on the folio size. We limit them to the maximum we
- * currently expect (e.g., hugetlb, dax).
+ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
+ * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
+ */
+#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
+#else
+/*
+ * Without hugetlb, gigantic folios that are bigger than a single PUD are
+ * currently impossible.
*/
#define MAX_FOLIO_ORDER PUD_ORDER
#endif
@@ -2401,31 +2408,6 @@ struct zap_details {
/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
-#ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_before_execve(struct task_struct *t);
-void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
-void sched_mm_cid_exit_signals(struct task_struct *t);
-static inline int task_mm_cid(struct task_struct *t)
-{
- return t->mm_cid;
-}
-#else
-static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
-static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
-static inline int task_mm_cid(struct task_struct *t)
-{
- /*
- * Use the processor id as a fall-back when the mm cid feature is
- * disabled. This provides functional per-cpu data structure accesses
- * in user-space, althrough it won't provide the memory usage benefits.
- */
- return raw_smp_processor_id();
-}
-#endif
-
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
@@ -3369,6 +3351,8 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
struct rb_root_cached *root);
+struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node,
+ unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
@@ -3495,10 +3479,10 @@ struct vm_unmapped_area_info {
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
/* truncate.c */
-extern void truncate_inode_pages(struct address_space *, loff_t);
-extern void truncate_inode_pages_range(struct address_space *,
- loff_t lstart, loff_t lend);
-extern void truncate_inode_pages_final(struct address_space *);
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart);
+void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart,
+ uoff_t lend);
+void truncate_inode_pages_final(struct address_space *mapping);
/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 90e5790c318f..3b7d05e7169c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -20,6 +20,7 @@
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
#include <linux/types.h>
+#include <linux/rseq_types.h>
#include <linux/bitmap.h>
#include <asm/mmu.h>
@@ -922,14 +923,6 @@ struct vm_area_struct {
#define vma_policy(vma) NULL
#endif
-#ifdef CONFIG_SCHED_MM_CID
-struct mm_cid {
- u64 time;
- int cid;
- int recent_cid;
-};
-#endif
-
/*
* Opaque type representing current mm_struct flag state. Must be accessed via
* mm_flags_xxx() helper functions.
@@ -991,44 +984,9 @@ struct mm_struct {
*/
atomic_t mm_users;
-#ifdef CONFIG_SCHED_MM_CID
- /**
- * @pcpu_cid: Per-cpu current cid.
- *
- * Keep track of the currently allocated mm_cid for each cpu.
- * The per-cpu mm_cid values are serialized by their respective
- * runqueue locks.
- */
- struct mm_cid __percpu *pcpu_cid;
- /*
- * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
- *
- * When the next mm_cid scan is due (in jiffies).
- */
- unsigned long mm_cid_next_scan;
- /**
- * @nr_cpus_allowed: Number of CPUs allowed for mm.
- *
- * Number of CPUs allowed in the union of all mm's
- * threads allowed CPUs.
- */
- unsigned int nr_cpus_allowed;
- /**
- * @max_nr_cid: Maximum number of allowed concurrency
- * IDs allocated.
- *
- * Track the highest number of allowed concurrency IDs
- * allocated for the mm.
- */
- atomic_t max_nr_cid;
- /**
- * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
- *
- * Provide mutual exclusion for mm cpus_allowed and
- * mm nr_cpus_allowed updates.
- */
- raw_spinlock_t cpus_allowed_lock;
-#endif
+ /* MM CID related storage */
+ struct mm_mm_cid mm_cid;
+
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
#endif
@@ -1370,37 +1328,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
}
#ifdef CONFIG_SCHED_MM_CID
-
-enum mm_cid_state {
- MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
- MM_CID_LAZY_PUT = (1U << 31),
-};
-
-static inline bool mm_cid_is_unset(int cid)
-{
- return cid == MM_CID_UNSET;
-}
-
-static inline bool mm_cid_is_lazy_put(int cid)
-{
- return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
-}
-
-static inline bool mm_cid_is_valid(int cid)
-{
- return !(cid & MM_CID_LAZY_PUT);
-}
-
-static inline int mm_cid_set_lazy_put(int cid)
-{
- return cid | MM_CID_LAZY_PUT;
-}
-
-static inline int mm_cid_clear_lazy_put(int cid)
-{
- return cid & ~MM_CID_LAZY_PUT;
-}
-
/*
* mm_cpus_allowed: Union of all mm's threads allowed CPUs.
*/
@@ -1415,37 +1342,21 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
}
/* Accessor for struct mm_struct's cidmask. */
-static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+static inline unsigned long *mm_cidmask(struct mm_struct *mm)
{
unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm);
/* Skip mm_cpus_allowed */
cid_bitmap += cpumask_size();
- return (struct cpumask *)cid_bitmap;
+ return (unsigned long *)cid_bitmap;
}
-static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
-{
- int i;
-
- for_each_possible_cpu(i) {
- struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
-
- pcpu_cid->cid = MM_CID_UNSET;
- pcpu_cid->recent_cid = MM_CID_UNSET;
- pcpu_cid->time = 0;
- }
- mm->nr_cpus_allowed = p->nr_cpus_allowed;
- atomic_set(&mm->max_nr_cid, 0);
- raw_spin_lock_init(&mm->cpus_allowed_lock);
- cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
- cpumask_clear(mm_cidmask(mm));
-}
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p);
static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
{
- mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
- if (!mm->pcpu_cid)
+ mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu);
+ if (!mm->mm_cid.pcpu)
return -ENOMEM;
mm_init_cid(mm, p);
return 0;
@@ -1454,37 +1365,24 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
static inline void mm_destroy_cid(struct mm_struct *mm)
{
- free_percpu(mm->pcpu_cid);
- mm->pcpu_cid = NULL;
+ free_percpu(mm->mm_cid.pcpu);
+ mm->mm_cid.pcpu = NULL;
}
static inline unsigned int mm_cid_size(void)
{
- return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */
+ /* mm_cpus_allowed(), mm_cidmask(). */
+ return cpumask_size() + bitmap_size(num_possible_cpus());
}
-static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
-{
- struct cpumask *mm_allowed = mm_cpus_allowed(mm);
-
- if (!mm)
- return;
- /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
- raw_spin_lock(&mm->cpus_allowed_lock);
- cpumask_or(mm_allowed, mm_allowed, cpumask);
- WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
- raw_spin_unlock(&mm->cpus_allowed_lock);
-}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
static inline void mm_destroy_cid(struct mm_struct *mm) { }
-
static inline unsigned int mm_cid_size(void)
{
return 0;
}
-static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
#endif /* CONFIG_SCHED_MM_CID */
struct mmu_gather;
diff --git a/include/linux/module.h b/include/linux/module.h
index e135cc79acee..d80c3ea57472 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -251,10 +251,11 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name);
*/
#define __mod_device_table(type, name) \
__PASTE(__mod_device_table__, \
+ __PASTE(kmod_, \
__PASTE(__KBUILD_MODNAME, \
__PASTE(__, \
__PASTE(type, \
- __PASTE(__, name)))))
+ __PASTE(__, name))))))
/* Creates an alias so file2alias.c can find device table. */
#define MODULE_DEVICE_TABLE(type, name) \
diff --git a/include/linux/msi.h b/include/linux/msi.h
index d415dd15a0a9..8003e3218c46 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -701,9 +701,6 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
void pci_msi_mask_irq(struct irq_data *data);
void pci_msi_unmask_irq(struct irq_data *data);
-struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
- struct msi_domain_info *info,
- struct irq_domain *parent);
u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev);
u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node);
struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev);
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 847b81ca6436..bf535f0118bb 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -86,8 +86,23 @@ do { \
#define DEFINE_MUTEX(mutexname) \
struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
-extern void __mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ mutex_init_lockep(lock, name, key);
+}
+#else
+extern void mutex_init_generic(struct mutex *lock);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ mutex_init_generic(lock);
+}
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
/**
* mutex_is_locked - is the mutex locked
@@ -111,17 +126,27 @@ extern bool mutex_is_locked(struct mutex *lock);
#define DEFINE_MUTEX(mutexname) \
struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
-extern void __mutex_rt_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
-
#define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex)
-#define __mutex_init(mutex, name, key) \
-do { \
- rt_mutex_base_init(&(mutex)->rtmutex); \
- __mutex_rt_init((mutex), name, key); \
-} while (0)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern void mutex_rt_init_lockdep(struct mutex *mutex, const char *name,
+ struct lock_class_key *key);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ mutex_rt_init_lockdep(lock, name, key);
+}
+#else
+extern void mutex_rt_init_generic(struct mutex *mutex);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ mutex_rt_init_generic(lock);
+}
+#endif /* !CONFIG_LOCKDEP */
#endif /* CONFIG_PREEMPT_RT */
#ifdef CONFIG_DEBUG_MUTEXES
diff --git a/include/linux/namei.h b/include/linux/namei.h
index fed86221c69c..58600cf234bc 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -7,6 +7,7 @@
#include <linux/path.h>
#include <linux/fcntl.h>
#include <linux/errno.h>
+#include <linux/fs_struct.h>
enum { MAX_NESTED_LINKS = 8 };
@@ -88,6 +89,81 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
struct qstr *name,
struct dentry *base);
+struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
+ struct qstr *name);
+struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
+ struct qstr *name);
+struct dentry *start_creating_killable(struct mnt_idmap *idmap,
+ struct dentry *parent,
+ struct qstr *name);
+struct dentry *start_removing_killable(struct mnt_idmap *idmap,
+ struct dentry *parent,
+ struct qstr *name);
+struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name);
+struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name);
+struct dentry *start_creating_dentry(struct dentry *parent,
+ struct dentry *child);
+struct dentry *start_removing_dentry(struct dentry *parent,
+ struct dentry *child);
+
+/* end_creating - finish action started with start_creating
+ * @child: dentry returned by start_creating() or vfs_mkdir()
+ *
+ * Unlock and release the child. This can be called after
+ * start_creating() whether that function succeeded or not,
+ * but it is not needed on failure.
+ *
+ * If vfs_mkdir() was called then the value returned from that function
+ * should be given for @child rather than the original dentry, as vfs_mkdir()
+ * may have provided a new dentry.
+ *
+ *
+ * If vfs_mkdir() was not called, then @child will be a valid dentry and
+ * @parent will be ignored.
+ */
+static inline void end_creating(struct dentry *child)
+{
+ end_dirop(child);
+}
+
+/* end_creating_keep - finish action started with start_creating() and return result
+ * @child: dentry returned by start_creating() or vfs_mkdir()
+ *
+ * Unlock and return the child. This can be called after
+ * start_creating() whether that function succeeded or not,
+ * but it is not needed on failure.
+ *
+ * If vfs_mkdir() was called then the value returned from that function
+ * should be given for @child rather than the original dentry, as vfs_mkdir()
+ * may have provided a new dentry.
+ *
+ * Returns: @child, which may be a dentry or an error.
+ *
+ */
+static inline struct dentry *end_creating_keep(struct dentry *child)
+{
+ if (!IS_ERR(child))
+ dget(child);
+ end_dirop(child);
+ return child;
+}
+
+/**
+ * end_removing - finish action started with start_removing
+ * @child: dentry returned by start_removing()
+ * @parent: dentry given to start_removing()
+ *
+ * Unlock and release the child.
+ *
+ * This is identical to end_dirop(). It can be passed the result of
+ * start_removing() whether that was successful or not, but it not needed
+ * if start_removing() failed.
+ */
+static inline void end_removing(struct dentry *child)
+{
+ end_dirop(child);
+}
+
extern int follow_down_one(struct path *);
extern int follow_down(struct path *path, unsigned int flags);
extern int follow_up(struct path *);
@@ -95,6 +171,13 @@ extern int follow_up(struct path *);
extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);
+int start_renaming(struct renamedata *rd, int lookup_flags,
+ struct qstr *old_last, struct qstr *new_last);
+int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+ struct dentry *old_dentry, struct qstr *new_last);
+int start_renaming_two_dentries(struct renamedata *rd,
+ struct dentry *old_dentry, struct dentry *new_dentry);
+void end_renaming(struct renamedata *rd);
/**
* mode_strip_umask - handle vfs umask stripping
diff --git a/include/linux/net/intel/libie/fwlog.h b/include/linux/net/intel/libie/fwlog.h
index 36b13fabca9e..7273c78c826b 100644
--- a/include/linux/net/intel/libie/fwlog.h
+++ b/include/linux/net/intel/libie/fwlog.h
@@ -78,8 +78,20 @@ struct libie_fwlog {
);
};
+#if IS_ENABLED(CONFIG_LIBIE_FWLOG)
int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api);
void libie_fwlog_deinit(struct libie_fwlog *fwlog);
void libie_fwlog_reregister(struct libie_fwlog *fwlog);
void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len);
+#else
+static inline int libie_fwlog_init(struct libie_fwlog *fwlog,
+ struct libie_fwlog_api *api)
+{
+ return -EOPNOTSUPP;
+}
+static inline void libie_fwlog_deinit(struct libie_fwlog *fwlog) { }
+static inline void libie_fwlog_reregister(struct libie_fwlog *fwlog) { }
+static inline void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf,
+ u16 len) { }
+#endif /* CONFIG_LIBIE_FWLOG */
#endif /* _LIBIE_FWLOG_H_ */
diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h
new file mode 100644
index 000000000000..b332b019b29c
--- /dev/null
+++ b/include/linux/ns/ns_common_types.h
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NS_COMMON_TYPES_H
+#define _LINUX_NS_COMMON_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/ns/nstree_types.h>
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/types.h>
+
+struct cgroup_namespace;
+struct dentry;
+struct ipc_namespace;
+struct mnt_namespace;
+struct net;
+struct pid_namespace;
+struct proc_ns_operations;
+struct time_namespace;
+struct user_namespace;
+struct uts_namespace;
+
+extern struct cgroup_namespace init_cgroup_ns;
+extern struct ipc_namespace init_ipc_ns;
+extern struct mnt_namespace init_mnt_ns;
+extern struct net init_net;
+extern struct pid_namespace init_pid_ns;
+extern struct time_namespace init_time_ns;
+extern struct user_namespace init_user_ns;
+extern struct uts_namespace init_uts_ns;
+
+extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations netns_operations;
+extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
+extern const struct proc_ns_operations userns_operations;
+extern const struct proc_ns_operations utsns_operations;
+
+/*
+ * Namespace lifetimes are managed via a two-tier reference counting model:
+ *
+ * (1) __ns_ref (refcount_t): Main reference count tracking memory
+ * lifetime. Controls when the namespace structure itself is freed.
+ * It also pins the namespace on the namespace trees whereas (2)
+ * only regulates their visibility to userspace.
+ *
+ * (2) __ns_ref_active (atomic_t): Reference count tracking active users.
+ * Controls visibility of the namespace in the namespace trees.
+ * Any live task that uses the namespace (via nsproxy or cred) holds
+ * an active reference. Any open file descriptor or bind-mount of
+ * the namespace holds an active reference. Once all tasks have
+ * called exited their namespaces and all file descriptors and
+ * bind-mounts have been released the active reference count drops
+ * to zero and the namespace becomes inactive. IOW, the namespace
+ * cannot be listed or opened via file handles anymore.
+ *
+ * Note that it is valid to transition from active to inactive and
+ * back from inactive to active e.g., when resurrecting an inactive
+ * namespace tree via the SIOCGSKNS ioctl().
+ *
+ * Relationship and lifecycle states:
+ *
+ * - Active (__ns_ref_active > 0):
+ * Namespace is actively used and visible to userspace. The namespace
+ * can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file
+ * handles, or discovered via listns().
+ *
+ * - Inactive (__ns_ref_active == 0, __ns_ref > 0):
+ * No tasks are actively using the namespace and it isn't pinned by
+ * any bind-mounts or open file descriptors anymore. But the namespace
+ * is still kept alive by internal references. For example, the user
+ * namespace could be pinned by an open file through file->f_cred
+ * references when one of the now defunct tasks had opened a file and
+ * handed the file descriptor off to another process via a UNIX
+ * sockets. Such references keep the namespace structure alive through
+ * __ns_ref but will not hold an active reference.
+ *
+ * - Destroyed (__ns_ref == 0):
+ * No references remain. The namespace is removed from the tree and freed.
+ *
+ * State transitions:
+ *
+ * Active -> Inactive:
+ * When the last task using the namespace exits it drops its active
+ * references to all namespaces. However, user and pid namespaces
+ * remain accessible until the task has been reaped.
+ *
+ * Inactive -> Active:
+ * An inactive namespace tree might be resurrected due to e.g., the
+ * SIOCGSKNS ioctl() on a socket.
+ *
+ * Inactive -> Destroyed:
+ * When __ns_ref drops to zero the namespace is removed from the
+ * namespaces trees and the memory is freed (after RCU grace period).
+ *
+ * Initial namespaces:
+ * Boot-time namespaces (init_net, init_pid_ns, etc.) start with
+ * __ns_ref_active = 1 and remain active forever.
+ *
+ * @ns_type: type of namespace (e.g., CLONE_NEWNET)
+ * @stashed: cached dentry to be used by the vfs
+ * @ops: namespace operations
+ * @inum: namespace inode number (quickly recycled for non-initial namespaces)
+ * @__ns_ref: main reference count (do not use directly)
+ * @ns_tree: namespace tree nodes and active reference count
+ */
+struct ns_common {
+ u32 ns_type;
+ struct dentry *stashed;
+ const struct proc_ns_operations *ops;
+ unsigned int inum;
+ refcount_t __ns_ref; /* do not use directly */
+ union {
+ struct ns_tree;
+ struct rcu_head ns_rcu;
+ };
+};
+
+#define to_ns_common(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &(__ns)->ns, \
+ const struct cgroup_namespace *: &(__ns)->ns, \
+ struct ipc_namespace *: &(__ns)->ns, \
+ const struct ipc_namespace *: &(__ns)->ns, \
+ struct mnt_namespace *: &(__ns)->ns, \
+ const struct mnt_namespace *: &(__ns)->ns, \
+ struct net *: &(__ns)->ns, \
+ const struct net *: &(__ns)->ns, \
+ struct pid_namespace *: &(__ns)->ns, \
+ const struct pid_namespace *: &(__ns)->ns, \
+ struct time_namespace *: &(__ns)->ns, \
+ const struct time_namespace *: &(__ns)->ns, \
+ struct user_namespace *: &(__ns)->ns, \
+ const struct user_namespace *: &(__ns)->ns, \
+ struct uts_namespace *: &(__ns)->ns, \
+ const struct uts_namespace *: &(__ns)->ns)
+
+#define ns_init_inum(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
+ struct ipc_namespace *: IPC_NS_INIT_INO, \
+ struct mnt_namespace *: MNT_NS_INIT_INO, \
+ struct net *: NET_NS_INIT_INO, \
+ struct pid_namespace *: PID_NS_INIT_INO, \
+ struct time_namespace *: TIME_NS_INIT_INO, \
+ struct user_namespace *: USER_NS_INIT_INO, \
+ struct uts_namespace *: UTS_NS_INIT_INO)
+
+#define ns_init_ns(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &init_cgroup_ns, \
+ struct ipc_namespace *: &init_ipc_ns, \
+ struct mnt_namespace *: &init_mnt_ns, \
+ struct net *: &init_net, \
+ struct pid_namespace *: &init_pid_ns, \
+ struct time_namespace *: &init_time_ns, \
+ struct user_namespace *: &init_user_ns, \
+ struct uts_namespace *: &init_uts_ns)
+
+#define ns_init_id(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_ID, \
+ struct ipc_namespace *: IPC_NS_INIT_ID, \
+ struct mnt_namespace *: MNT_NS_INIT_ID, \
+ struct net *: NET_NS_INIT_ID, \
+ struct pid_namespace *: PID_NS_INIT_ID, \
+ struct time_namespace *: TIME_NS_INIT_ID, \
+ struct user_namespace *: USER_NS_INIT_ID, \
+ struct uts_namespace *: UTS_NS_INIT_ID)
+
+#define to_ns_operations(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
+ struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
+ struct mnt_namespace *: &mntns_operations, \
+ struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
+ struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
+ struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
+ struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
+ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
+
+#define ns_common_type(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CLONE_NEWCGROUP, \
+ struct ipc_namespace *: CLONE_NEWIPC, \
+ struct mnt_namespace *: CLONE_NEWNS, \
+ struct net *: CLONE_NEWNET, \
+ struct pid_namespace *: CLONE_NEWPID, \
+ struct time_namespace *: CLONE_NEWTIME, \
+ struct user_namespace *: CLONE_NEWUSER, \
+ struct uts_namespace *: CLONE_NEWUTS)
+
+#endif /* _LINUX_NS_COMMON_TYPES_H */
diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h
new file mode 100644
index 000000000000..2fb28ee31efb
--- /dev/null
+++ b/include/linux/ns/nstree_types.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+#ifndef _LINUX_NSTREE_TYPES_H
+#define _LINUX_NSTREE_TYPES_H
+
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+/**
+ * struct ns_tree_root - Root of a namespace tree
+ * @ns_rb: Red-black tree root for efficient lookups
+ * @ns_list_head: List head for sequential iteration
+ *
+ * Each namespace tree maintains both an rbtree (for O(log n) lookups)
+ * and a list (for efficient sequential iteration). The list is kept in
+ * the same sorted order as the rbtree.
+ */
+struct ns_tree_root {
+ struct rb_root ns_rb;
+ struct list_head ns_list_head;
+};
+
+/**
+ * struct ns_tree_node - Node in a namespace tree
+ * @ns_node: Red-black tree node
+ * @ns_list_entry: List entry for sequential iteration
+ *
+ * Represents a namespace's position in a tree. Each namespace has
+ * multiple tree nodes for different trees (unified, per-type, owner).
+ */
+struct ns_tree_node {
+ struct rb_node ns_node;
+ struct list_head ns_list_entry;
+};
+
+/**
+ * struct ns_tree - Namespace tree nodes and active reference count
+ * @ns_id: Unique namespace identifier
+ * @__ns_ref_active: Active reference count (do not use directly)
+ * @ns_unified_node: Node in the global namespace tree
+ * @ns_tree_node: Node in the per-type namespace tree
+ * @ns_owner_node: Node in the owner namespace's tree of owned namespaces
+ * @ns_owner_root: Root of the tree of namespaces owned by this namespace
+ * (only used when this namespace is an owner)
+ */
+struct ns_tree {
+ u64 ns_id;
+ atomic_t __ns_ref_active;
+ struct ns_tree_node ns_unified_node;
+ struct ns_tree_node ns_tree_node;
+ struct ns_tree_node ns_owner_node;
+ struct ns_tree_root ns_owner_root;
+};
+
+#endif /* _LINUX_NSTREE_TYPES_H */
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index f5b68b8abb54..825f5865bfc5 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -2,122 +2,44 @@
#ifndef _LINUX_NS_COMMON_H
#define _LINUX_NS_COMMON_H
+#include <linux/ns/ns_common_types.h>
#include <linux/refcount.h>
-#include <linux/rbtree.h>
+#include <linux/vfsdebug.h>
#include <uapi/linux/sched.h>
+#include <uapi/linux/nsfs.h>
-struct proc_ns_operations;
-
-struct cgroup_namespace;
-struct ipc_namespace;
-struct mnt_namespace;
-struct net;
-struct pid_namespace;
-struct time_namespace;
-struct user_namespace;
-struct uts_namespace;
-
-extern struct cgroup_namespace init_cgroup_ns;
-extern struct ipc_namespace init_ipc_ns;
-extern struct mnt_namespace init_mnt_ns;
-extern struct net init_net;
-extern struct pid_namespace init_pid_ns;
-extern struct time_namespace init_time_ns;
-extern struct user_namespace init_user_ns;
-extern struct uts_namespace init_uts_ns;
-
-extern const struct proc_ns_operations netns_operations;
-extern const struct proc_ns_operations utsns_operations;
-extern const struct proc_ns_operations ipcns_operations;
-extern const struct proc_ns_operations pidns_operations;
-extern const struct proc_ns_operations pidns_for_children_operations;
-extern const struct proc_ns_operations userns_operations;
-extern const struct proc_ns_operations mntns_operations;
-extern const struct proc_ns_operations cgroupns_operations;
-extern const struct proc_ns_operations timens_operations;
-extern const struct proc_ns_operations timens_for_children_operations;
-
-struct ns_common {
- u32 ns_type;
- struct dentry *stashed;
- const struct proc_ns_operations *ops;
- unsigned int inum;
- refcount_t __ns_ref; /* do not use directly */
- union {
- struct {
- u64 ns_id;
- struct rb_node ns_tree_node;
- struct list_head ns_list_node;
- };
- struct rcu_head ns_rcu;
- };
-};
-
+bool is_current_namespace(struct ns_common *ns);
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
void __ns_common_free(struct ns_common *ns);
+struct ns_common *__must_check ns_owner(struct ns_common *ns);
+
+static __always_inline bool is_ns_init_inum(const struct ns_common *ns)
+{
+ VFS_WARN_ON_ONCE(ns->inum == 0);
+ return unlikely(in_range(ns->inum, MNT_NS_INIT_INO,
+ IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
+}
+
+static __always_inline bool is_ns_init_id(const struct ns_common *ns)
+{
+ VFS_WARN_ON_ONCE(ns->ns_id == 0);
+ return ns->ns_id <= NS_LAST_INIT_ID;
+}
-#define to_ns_common(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &(__ns)->ns, \
- const struct cgroup_namespace *: &(__ns)->ns, \
- struct ipc_namespace *: &(__ns)->ns, \
- const struct ipc_namespace *: &(__ns)->ns, \
- struct mnt_namespace *: &(__ns)->ns, \
- const struct mnt_namespace *: &(__ns)->ns, \
- struct net *: &(__ns)->ns, \
- const struct net *: &(__ns)->ns, \
- struct pid_namespace *: &(__ns)->ns, \
- const struct pid_namespace *: &(__ns)->ns, \
- struct time_namespace *: &(__ns)->ns, \
- const struct time_namespace *: &(__ns)->ns, \
- struct user_namespace *: &(__ns)->ns, \
- const struct user_namespace *: &(__ns)->ns, \
- struct uts_namespace *: &(__ns)->ns, \
- const struct uts_namespace *: &(__ns)->ns)
-
-#define ns_init_inum(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
- struct ipc_namespace *: IPC_NS_INIT_INO, \
- struct mnt_namespace *: MNT_NS_INIT_INO, \
- struct net *: NET_NS_INIT_INO, \
- struct pid_namespace *: PID_NS_INIT_INO, \
- struct time_namespace *: TIME_NS_INIT_INO, \
- struct user_namespace *: USER_NS_INIT_INO, \
- struct uts_namespace *: UTS_NS_INIT_INO)
-
-#define ns_init_ns(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &init_cgroup_ns, \
- struct ipc_namespace *: &init_ipc_ns, \
- struct mnt_namespace *: &init_mnt_ns, \
- struct net *: &init_net, \
- struct pid_namespace *: &init_pid_ns, \
- struct time_namespace *: &init_time_ns, \
- struct user_namespace *: &init_user_ns, \
- struct uts_namespace *: &init_uts_ns)
-
-#define to_ns_operations(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
- struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
- struct mnt_namespace *: &mntns_operations, \
- struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
- struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
- struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
- struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
- struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
-
-#define ns_common_type(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: CLONE_NEWCGROUP, \
- struct ipc_namespace *: CLONE_NEWIPC, \
- struct mnt_namespace *: CLONE_NEWNS, \
- struct net *: CLONE_NEWNET, \
- struct pid_namespace *: CLONE_NEWPID, \
- struct time_namespace *: CLONE_NEWTIME, \
- struct user_namespace *: CLONE_NEWUSER, \
- struct uts_namespace *: CLONE_NEWUTS)
+#define NS_COMMON_INIT(nsname) \
+{ \
+ .ns_type = ns_common_type(&nsname), \
+ .ns_id = ns_init_id(&nsname), \
+ .inum = ns_init_inum(&nsname), \
+ .ops = to_ns_operations(&nsname), \
+ .stashed = NULL, \
+ .__ns_ref = REFCOUNT_INIT(1), \
+ .__ns_ref_active = ATOMIC_INIT(1), \
+ .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \
+ .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \
+ .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \
+ .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \
+}
#define ns_common_init(__ns) \
__ns_common_init(to_ns_common(__ns), \
@@ -133,21 +55,96 @@ void __ns_common_free(struct ns_common *ns);
#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))
+static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns)
+{
+ return atomic_read(&ns->__ns_ref_active);
+}
+
+static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
+{
+ return refcount_read(&ns->__ns_ref);
+}
+
static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
{
- return refcount_dec_and_test(&ns->__ns_ref);
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return false;
+ }
+ if (refcount_dec_and_test(&ns->__ns_ref)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+ return true;
+ }
+ return false;
}
static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
{
- return refcount_inc_not_zero(&ns->__ns_ref);
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return true;
+ }
+ if (refcount_inc_not_zero(&ns->__ns_ref))
+ return true;
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+ return false;
}
-#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref)
-#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
-#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
-#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
-#define ns_ref_put_and_lock(__ns, __lock) \
- refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))
+static __always_inline void __ns_ref_inc(struct ns_common *ns)
+{
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return;
+ }
+ refcount_inc(&ns->__ns_ref);
+}
+
+static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns,
+ spinlock_t *ns_lock)
+{
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return false;
+ }
+ return refcount_dec_and_lock(&ns->__ns_ref, ns_lock);
+}
+
+#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns)))
+#define ns_ref_inc(__ns) \
+ do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0)
+#define ns_ref_get(__ns) \
+ ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false)
+#define ns_ref_put(__ns) \
+ ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false)
+#define ns_ref_put_and_lock(__ns, __ns_lock) \
+ ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false)
+
+#define ns_ref_active_read(__ns) \
+ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
+
+void __ns_ref_active_put(struct ns_common *ns);
+
+#define ns_ref_active_put(__ns) \
+ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0)
+
+static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
+{
+ if (!__ns_ref_active_read(ns)) {
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ return NULL;
+ }
+ if (!__ns_ref_get(ns))
+ return NULL;
+ return ns;
+}
+
+void __ns_ref_active_get(struct ns_common *ns);
+
+#define ns_ref_active_get(__ns) \
+ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
#endif
diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h
index e5a5fa83d36b..731b67fc2fec 100644
--- a/include/linux/nsfs.h
+++ b/include/linux/nsfs.h
@@ -37,4 +37,7 @@ void nsfs_init(void);
#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns)
+void nsproxy_ns_active_get(struct nsproxy *ns);
+void nsproxy_ns_active_put(struct nsproxy *ns);
+
#endif /* _LINUX_NSFS_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index bd118a187dec..5a67648721c7 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -93,10 +93,13 @@ static inline struct cred *nsset_cred(struct nsset *set)
*/
int copy_namespaces(u64 flags, struct task_struct *tsk);
-void exit_task_namespaces(struct task_struct *tsk);
+void switch_cred_namespaces(const struct cred *old, const struct cred *new);
+void exit_nsproxy_namespaces(struct task_struct *tsk);
+void get_cred_namespaces(struct task_struct *tsk);
+void exit_cred_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
int exec_task_namespaces(void);
-void free_nsproxy(struct nsproxy *ns);
+void deactivate_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
struct cred *, struct fs_struct *);
int __init nsproxy_cache_init(void);
@@ -104,7 +107,7 @@ int __init nsproxy_cache_init(void);
static inline void put_nsproxy(struct nsproxy *ns)
{
if (refcount_dec_and_test(&ns->count))
- free_nsproxy(ns);
+ deactivate_nsproxy(ns);
}
static inline void get_nsproxy(struct nsproxy *ns)
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 8b8636690473..175e4625bfa6 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -1,22 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#ifndef _LINUX_NSTREE_H
#define _LINUX_NSTREE_H
-#include <linux/ns_common.h>
+#include <linux/ns/nstree_types.h>
#include <linux/nsproxy.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/rculist.h>
#include <linux/cookie.h>
+#include <uapi/linux/nsfs.h>
-extern struct ns_tree cgroup_ns_tree;
-extern struct ns_tree ipc_ns_tree;
-extern struct ns_tree mnt_ns_tree;
-extern struct ns_tree net_ns_tree;
-extern struct ns_tree pid_ns_tree;
-extern struct ns_tree time_ns_tree;
-extern struct ns_tree user_ns_tree;
-extern struct ns_tree uts_ns_tree;
+struct ns_common;
+
+extern struct ns_tree_root cgroup_ns_tree;
+extern struct ns_tree_root ipc_ns_tree;
+extern struct ns_tree_root mnt_ns_tree;
+extern struct ns_tree_root net_ns_tree;
+extern struct ns_tree_root pid_ns_tree;
+extern struct ns_tree_root time_ns_tree;
+extern struct ns_tree_root user_ns_tree;
+extern struct ns_tree_root uts_ns_tree;
+
+void ns_tree_node_init(struct ns_tree_node *node);
+void ns_tree_root_init(struct ns_tree_root *root);
+bool ns_tree_node_empty(const struct ns_tree_node *node);
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+ struct ns_tree_root *root,
+ int (*cmp)(struct rb_node *, const struct rb_node *));
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root);
#define to_ns_tree(__ns) \
_Generic((__ns), \
@@ -29,17 +41,21 @@ extern struct ns_tree uts_ns_tree;
struct user_namespace *: &(user_ns_tree), \
struct uts_namespace *: &(uts_ns_tree))
-u64 ns_tree_gen_id(struct ns_common *ns);
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree);
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree);
+#define ns_tree_gen_id(__ns) \
+ __ns_tree_gen_id(to_ns_common(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
+
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id);
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree);
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree);
struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
- struct ns_tree *ns_tree,
+ struct ns_tree_root *ns_tree,
bool previous);
-static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
+static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id)
{
- ns_tree_gen_id(ns);
+ __ns_tree_gen_id(ns, id);
__ns_tree_add_raw(ns, ns_tree);
}
@@ -59,7 +75,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
* This function assigns a new id to the namespace and adds it to the
* appropriate namespace tree and list.
*/
-#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns))
+#define ns_tree_add(__ns) \
+ __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
/**
* ns_tree_remove - Remove a namespace from a namespace tree
@@ -73,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
#define ns_tree_adjoined_rcu(__ns, __previous) \
__ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous)
-#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node))
+#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node))
#endif /* _LINUX_NSTREE_H */
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 46ebaa46e6c5..b18ab53561c9 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -3,16 +3,16 @@
#define _LINUX_OBJTOOL_H
#include <linux/objtool_types.h>
+#include <linux/annotate.h>
#ifdef CONFIG_OBJTOOL
-#include <asm/asm.h>
-
#ifndef __ASSEMBLY__
-#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \
+#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \
"987: \n\t" \
".pushsection .discard.unwind_hints\n\t" \
+ ANNOTATE_DATA_SPECIAL \
/* struct unwind_hint */ \
".long 987b - .\n\t" \
".short " __stringify(sp_offset) "\n\t" \
@@ -53,16 +53,6 @@
#define __ASM_BREF(label) label ## b
-#define __ASM_ANNOTATE(label, type) \
- ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \
- ".long " __stringify(label) " - .\n\t" \
- ".long " __stringify(type) "\n\t" \
- ".popsection\n\t"
-
-#define ASM_ANNOTATE(type) \
- "911:\n\t" \
- __ASM_ANNOTATE(911b, type)
-
#else /* __ASSEMBLY__ */
/*
@@ -89,6 +79,7 @@
.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
.Lhere_\@:
.pushsection .discard.unwind_hints
+ ANNOTATE_DATA_SPECIAL
/* struct unwind_hint */
.long .Lhere_\@ - .
.short \sp_offset
@@ -101,7 +92,7 @@
.macro STACK_FRAME_NON_STANDARD func:req
.pushsection .discard.func_stack_frame_non_standard, "aw"
- .long \func - .
+ .quad \func
.popsection
.endm
@@ -111,14 +102,6 @@
#endif
.endm
-.macro ANNOTATE type:req
-.Lhere_\@:
- .pushsection .discard.annotate_insn,"M",@progbits,8
- .long .Lhere_\@ - .
- .long \type
- .popsection
-.endm
-
#endif /* __ASSEMBLY__ */
#else /* !CONFIG_OBJTOOL */
@@ -128,84 +111,15 @@
#define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t"
#define STACK_FRAME_NON_STANDARD(func)
#define STACK_FRAME_NON_STANDARD_FP(func)
-#define __ASM_ANNOTATE(label, type) ""
-#define ASM_ANNOTATE(type)
#else
.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
.endm
.macro STACK_FRAME_NON_STANDARD func:req
.endm
-.macro ANNOTATE type:req
-.endm
#endif
#endif /* CONFIG_OBJTOOL */
-#ifndef __ASSEMBLY__
-/*
- * Annotate away the various 'relocation to !ENDBR` complaints; knowing that
- * these relocations will never be used for indirect calls.
- */
-#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR)
-#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR))
-
-/*
- * This should be used immediately before an indirect jump/call. It tells
- * objtool the subsequent indirect jump/call is vouched safe for retpoline
- * builds.
- */
-#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE)
-/*
- * See linux/instrumentation.h
- */
-#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN)
-#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END)
-/*
- * objtool annotation to ignore the alternatives and only consider the original
- * instruction(s).
- */
-#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS)
-/*
- * This macro indicates that the following intra-function call is valid.
- * Any non-annotated intra-function call will cause objtool to issue a warning.
- */
-#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL)
-/*
- * Use objtool to validate the entry requirement that all code paths do
- * VALIDATE_UNRET_END before RET.
- *
- * NOTE: The macro must be used at the beginning of a global symbol, otherwise
- * it will be ignored.
- */
-#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN)
-/*
- * This should be used to refer to an instruction that is considered
- * terminating, like a noreturn CALL or UD2 when we know they are not -- eg
- * WARN using UD2.
- */
-#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE)
-/*
- * This should not be used; it annotates away CFI violations. There are a few
- * valid use cases like kexec handover to the next kernel image, and there is
- * no security concern there.
- *
- * There are also a few real issues annotated away, like EFI because we can't
- * control the EFI code.
- */
-#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI))
-
-#else
-#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR
-#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE
-/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */
-/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */
-#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS
-#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL
-#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN
-#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE
-#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI
-#endif
-
#if defined(CONFIG_NOINSTR_VALIDATION) && \
(defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
#define VALIDATE_UNRET_BEGIN ANNOTATE_UNRET_BEGIN
diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h
index aceac94632c8..c6def4049b1a 100644
--- a/include/linux/objtool_types.h
+++ b/include/linux/objtool_types.h
@@ -67,4 +67,6 @@ struct unwind_hint {
#define ANNOTYPE_REACHABLE 8
#define ANNOTYPE_NOCFI 9
+#define ANNOTYPE_DATA_SPECIAL 1
+
#endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index 1db8543dfc8a..1c2bc0281807 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -43,6 +43,8 @@ extern int of_irq_parse_one(struct device_node *device, int index,
struct of_phandle_args *out_irq);
extern int of_irq_count(struct device_node *dev);
extern int of_irq_get(struct device_node *dev, int index);
+extern const struct cpumask *of_irq_get_affinity(struct device_node *dev,
+ int index);
extern int of_irq_get_byname(struct device_node *dev, const char *name);
extern int of_irq_to_resource_table(struct device_node *dev,
struct resource *res, int nr_irqs);
@@ -76,6 +78,11 @@ static inline int of_irq_get_byname(struct device_node *dev, const char *name)
{
return 0;
}
+static inline const struct cpumask *of_irq_get_affinity(struct device_node *dev,
+ int index)
+{
+ return NULL;
+}
static inline int of_irq_to_resource_table(struct device_node *dev,
struct resource *res, int nr_irqs)
{
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..e601a3144f28 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -38,6 +38,7 @@ int filemap_invalidate_pages(struct address_space *mapping,
int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);
int filemap_flush(struct address_space *);
+int filemap_flush_nr(struct address_space *mapping, long *nr_to_write);
int filemap_fdatawait_keep_errors(struct address_space *mapping);
int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
@@ -53,14 +54,10 @@ static inline int filemap_fdatawait(struct address_space *mapping)
bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
-int __filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end, int sync_mode);
int filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end);
int filemap_check_errors(struct address_space *mapping);
void __filemap_set_wb_err(struct address_space *mapping, int err);
-int filemap_fdatawrite_wbc(struct address_space *mapping,
- struct writeback_control *wbc);
int kiocb_write_and_wait(struct kiocb *iocb, size_t count);
static inline int filemap_write_and_wait(struct address_space *mapping)
@@ -942,6 +939,17 @@ static inline pgoff_t folio_next_index(const struct folio *folio)
}
/**
+ * folio_next_pos - Get the file position of the next folio.
+ * @folio: The current folio.
+ *
+ * Return: The position of the folio which follows this folio in the file.
+ */
+static inline loff_t folio_next_pos(const struct folio *folio)
+{
+ return (loff_t)folio_next_index(folio) << PAGE_SHIFT;
+}
+
+/**
* folio_file_page - The page for a particular index.
* @folio: The folio which contains this index.
* @index: The index we want to look up.
@@ -977,6 +985,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);
+unsigned filemap_get_folios_dirty(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
struct folio *read_cache_folio(struct address_space *, pgoff_t index,
filler_t *filler, struct file *file);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..bf97d49c23cf 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -412,6 +412,8 @@ struct pci_dev {
u16 l1ss; /* L1SS Capability pointer */
#ifdef CONFIG_PCIEASPM
struct pcie_link_state *link_state; /* ASPM link state */
+ unsigned int aspm_l0s_support:1; /* ASPM L0s support */
+ unsigned int aspm_l1_support:1; /* ASPM L1 support */
unsigned int ltr_path:1; /* Latency Tolerance Reporting
supported from root to here */
#endif
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 12d90360f6db..43c854a273c3 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -52,7 +52,7 @@
__section(".discard") __attribute__((unused))
/*
- * s390 and alpha modules require percpu variables to be defined as
+ * alpha modules require percpu variables to be defined as
* weak to force the compiler to generate GOT based external
* references for them. This is necessary because percpu sections
* will be located outside of the usually addressable area.
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 93c9a26492fc..52b37f7bdbf9 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -119,6 +119,7 @@ struct arm_pmu {
/* PMUv3 only */
int pmuver;
+ bool has_smt;
u64 reg_pmmir;
u64 reg_brbidr;
#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40
@@ -132,8 +133,6 @@ struct arm_pmu {
#define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
-DECLARE_PER_CPU(struct arm_pmu *, cpu_armpmu);
-
u64 armpmu_event_update(struct perf_event *event);
int armpmu_event_set_period(struct perf_event *event);
@@ -190,8 +189,8 @@ bool arm_pmu_irq_is_nmi(void);
struct arm_pmu *armpmu_alloc(void);
void armpmu_free(struct arm_pmu *pmu);
int armpmu_register(struct arm_pmu *pmu);
-int armpmu_request_irq(int irq, int cpu);
-void armpmu_free_irq(int irq, int cpu);
+int armpmu_request_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu);
+void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu);
#define ARMV8_PMU_PDEV_NAME "armv8-pmu"
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fd1d91017b99..9870d768db4c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark);
+ u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 32e8457ad535..ee3148ef87f6 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1232,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
#endif
+#ifndef flush_tlb_fix_spurious_fault_pmd
+#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0)
+#endif
+
/*
* When walking page tables, get the address of the next boundary,
* or the end address of the range if that comes earlier. Although no
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 445517a72ad0..0e7ae12c96d2 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
- if (ns != &init_pid_ns)
- ns_ref_inc(ns);
+ ns_ref_inc(ns);
return ns;
}
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 9d42d473d201..7f6a92ac9704 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -44,11 +44,11 @@ typedef unsigned int pipe_index_t;
typedef unsigned short pipe_index_t;
#endif
-/*
- * We have to declare this outside 'struct pipe_inode_info',
- * but then we can't use 'union pipe_index' for an anonymous
- * union, so we end up having to duplicate this declaration
- * below. Annoying.
+/**
+ * struct pipe_index - pipe indeces
+ * @head: The point of buffer production
+ * @tail: The point of buffer consumption
+ * @head_tail: unsigned long union of @head and @tail
*/
union pipe_index {
unsigned long head_tail;
@@ -63,9 +63,7 @@ union pipe_index {
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
- * @head: The point of buffer production
- * @tail: The point of buffer consumption
- * @head_tail: unsigned long union of @head and @tail
+ * @pipe_index: the pipe indeces
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
@@ -87,14 +85,7 @@ struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
- /* This has to match the 'union pipe_index' above */
- union {
- unsigned long head_tail;
- struct {
- pipe_index_t head;
- pipe_index_t tail;
- };
- };
+ union pipe_index;
unsigned int max_usage;
unsigned int ring_size;
diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h
index 1571e9157fa5..b1b837583d54 100644
--- a/include/linux/platform_data/x86/int3472.h
+++ b/include/linux/platform_data/x86/int3472.h
@@ -100,7 +100,6 @@ struct int3472_gpio_regulator {
struct regulator_consumer_supply supply_map[GPIO_REGULATOR_SUPPLY_MAP_COUNT * 2];
char supply_name_upper[GPIO_SUPPLY_NAME_LENGTH];
char regulator_name[GPIO_REGULATOR_NAME_LENGTH];
- struct gpio_desc *ena_gpio;
struct regulator_dev *rdev;
struct regulator_desc rdesc;
};
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 074754c23d33..93c945331f39 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -102,6 +102,8 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev,
extern int platform_get_irq(struct platform_device *, unsigned int);
extern int platform_get_irq_optional(struct platform_device *, unsigned int);
+extern int platform_get_irq_affinity(struct platform_device *, unsigned int,
+ const struct cpumask **);
extern int platform_irq_count(struct platform_device *);
extern int devm_platform_get_irqs_affinity(struct platform_device *dev,
struct irq_affinity *affd,
@@ -232,6 +234,7 @@ extern int platform_device_add_data(struct platform_device *pdev,
extern int platform_device_add(struct platform_device *pdev);
extern void platform_device_del(struct platform_device *pdev);
extern void platform_device_put(struct platform_device *pdev);
+DEFINE_FREE(platform_device_put, struct platform_device *, if (_T) platform_device_put(_T))
struct platform_driver {
int (*probe)(struct platform_device *);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index cc7b2dc28574..7f69f739f613 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -25,11 +25,12 @@ extern void (*pm_power_off)(void);
struct device; /* we have a circular dep with device.h */
#ifdef CONFIG_VT_CONSOLE_SLEEP
-extern void pm_vt_switch_required(struct device *dev, bool required);
+extern int pm_vt_switch_required(struct device *dev, bool required);
extern void pm_vt_switch_unregister(struct device *dev);
#else
-static inline void pm_vt_switch_required(struct device *dev, bool required)
+static inline int pm_vt_switch_required(struct device *dev, bool required)
{
+ return 0;
}
static inline void pm_vt_switch_unregister(struct device *dev)
{
@@ -507,6 +508,7 @@ const struct dev_pm_ops name = { \
* RECOVER Creation of a hibernation image or restoration of the main
* memory contents from a hibernation image has failed, call
* ->thaw() and ->complete() for all devices.
+ * POWEROFF System will poweroff, call ->poweroff() for all devices.
*
* The following PM_EVENT_ messages are defined for internal use by
* kernel subsystems. They are never issued by the PM core.
@@ -537,6 +539,7 @@ const struct dev_pm_ops name = { \
#define PM_EVENT_USER 0x0100
#define PM_EVENT_REMOTE 0x0200
#define PM_EVENT_AUTO 0x0400
+#define PM_EVENT_POWEROFF 0x0800
#define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE)
#define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND)
@@ -551,6 +554,7 @@ const struct dev_pm_ops name = { \
#define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, })
#define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, })
#define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, })
+#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, })
#define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, })
#define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, })
#define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, })
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index f67a2cb7d781..93ba0143ca47 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -153,6 +153,7 @@ enum genpd_sync_state {
};
struct dev_power_governor {
+ bool (*system_power_down_ok)(struct dev_pm_domain *domain);
bool (*power_down_ok)(struct dev_pm_domain *domain);
bool (*suspend_ok)(struct device *dev);
};
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 4a69d4af3ff8..6cea4455f867 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -162,6 +162,15 @@ static inline void cpu_latency_qos_update_request(struct pm_qos_request *req,
static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {}
#endif
+#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
+s32 cpu_wakeup_latency_qos_limit(void);
+#else
+static inline s32 cpu_wakeup_latency_qos_limit(void)
+{
+ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+}
+#endif
+
#ifdef CONFIG_PM
enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask);
enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask);
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 0b436e15f4cd..911d7a4d32c1 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -637,6 +637,30 @@ DEFINE_GUARD_COND(pm_runtime_active_auto, _try,
DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled,
pm_runtime_resume_and_get(_T), _RET == 0)
+/* ACQUIRE() wrapper macros for the guards defined above. */
+
+#define PM_RUNTIME_ACQUIRE(_dev, _var) \
+ ACQUIRE(pm_runtime_active_try, _var)(_dev)
+
+#define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var) \
+ ACQUIRE(pm_runtime_active_auto_try, _var)(_dev)
+
+#define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var) \
+ ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev)
+
+#define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var) \
+ ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev)
+
+/*
+ * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active.
+ *
+ * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the
+ * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with
+ * any of them) and if it is nonzero, avoid accessing the given device.
+ */
+#define PM_RUNTIME_ACQUIRE_ERR(_var_ptr) \
+ ACQUIRE_ERR(pm_runtime_active, _var_ptr)
+
/**
* pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0.
* @dev: Target device.
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 102202185d7a..d964f965c8ff 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -134,11 +134,9 @@ static __always_inline unsigned char interrupt_context_level(void)
/*
* The following macros are deprecated and should not be used in new code:
- * in_irq() - Obsolete version of in_hardirq()
* in_softirq() - We have BH disabled, or are processing softirqs
* in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled
*/
-#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
diff --git a/include/linux/prmt.h b/include/linux/prmt.h
index c53ab287e932..8cdc987de963 100644
--- a/include/linux/prmt.h
+++ b/include/linux/prmt.h
@@ -4,9 +4,11 @@
#ifdef CONFIG_ACPI_PRMT
void init_prmt(void);
+bool acpi_prm_handler_available(const guid_t *handler_guid);
int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer);
#else
static inline void init_prmt(void) { }
+static inline bool acpi_prm_handler_available(const guid_t *handler_guid) { return false; }
static inline int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer)
{
return -EOPNOTSUPP;
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index 2503f7625d65..a651e60d9410 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -9,6 +9,7 @@ struct pseudo_fs_context {
const struct xattr_handler * const *xattr;
const struct dentry_operations *dops;
unsigned long magic;
+ unsigned int s_d_flags;
};
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 4e1ac1fbcec4..55343795644b 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1643,7 +1643,7 @@ struct regmap_irq_chip_data;
* @status_invert: Inverted status register: cleared bits are active interrupts.
* @status_is_level: Status register is actuall signal level: Xor status
* register with previous value to get active interrupts.
- * @wake_invert: Inverted wake register: cleared bits are wake enabled.
+ * @wake_invert: Inverted wake register: cleared bits are wake disabled.
* @type_in_mask: Use the mask registers for controlling irq type. Use this if
* the hardware provides separate bits for rising/falling edge
* or low/high level interrupts and they should be combined into
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index a7d92718b653..54701668b3df 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -206,6 +206,8 @@ struct rdt_mon_domain {
* @arch_has_sparse_bitmasks: True if a bitmask like f00f is valid.
* @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache
* level has CPU scope.
+ * @io_alloc_capable: True if portion of the cache can be configured
+ * for I/O traffic.
*/
struct resctrl_cache {
unsigned int cbm_len;
@@ -213,6 +215,7 @@ struct resctrl_cache {
unsigned int shareable_bits;
bool arch_has_sparse_bitmasks;
bool arch_has_per_cpu_cfg;
+ bool io_alloc_capable;
};
/**
@@ -654,6 +657,27 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 closid, u32 rmid, int cntr_id,
enum resctrl_event_id eventid);
+/**
+ * resctrl_arch_io_alloc_enable() - Enable/disable io_alloc feature.
+ * @r: The resctrl resource.
+ * @enable: Enable (true) or disable (false) io_alloc on resource @r.
+ *
+ * This can be called from any CPU.
+ *
+ * Return:
+ * 0 on success, <0 on error.
+ */
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable);
+
+/**
+ * resctrl_arch_get_io_alloc_enabled() - Get io_alloc feature state.
+ * @r: The resctrl resource.
+ *
+ * Return:
+ * true if io_alloc is enabled or false if disabled.
+ */
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r);
+
extern unsigned int resctrl_rmid_realloc_threshold;
extern unsigned int resctrl_rmid_realloc_limit;
diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index 7e50bbc94e47..36ddfa1ec301 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -43,7 +43,7 @@ struct restart_block {
struct __kernel_timespec __user *rmtp;
struct old_timespec32 __user *compat_rmtp;
};
- u64 expires;
+ ktime_t expires;
} nanosleep;
/* For poll */
struct {
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index e0135e0adae0..bf92227c78d0 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
mem_cgroup_handle_over_high(GFP_KERNEL);
blkcg_maybe_throttle_current();
- rseq_handle_notify_resume(NULL, regs);
+ rseq_handle_slowpath(regs);
}
#endif /* LINUX_RESUME_USER_MODE_H */
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 69553e7c14c1..2266f4dc77b6 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -3,134 +3,164 @@
#define _LINUX_RSEQ_H
#ifdef CONFIG_RSEQ
-
-#include <linux/preempt.h>
#include <linux/sched.h>
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD irq
-#else
-# define RSEQ_EVENT_GUARD preempt
-#endif
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
- RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
- RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
- RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
- RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
- RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
- RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
- if (t->rseq)
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
+#include <uapi/linux/rseq.h>
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+void __rseq_handle_slowpath(struct pt_regs *regs);
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
+/* Invoked from resume_user_mode_work() */
+static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
- if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
+ if (current->rseq.event.slowpath)
+ __rseq_handle_slowpath(regs);
+ } else {
+ /* '&' is intentional to spare one conditional branch */
+ if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
+ __rseq_handle_slowpath(regs);
+ }
}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
-{
- scoped_guard(RSEQ_EVENT_GUARD)
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
- rseq_handle_notify_resume(ksig, regs);
-}
+void __rseq_signal_deliver(int sig, struct pt_regs *regs);
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
+/*
+ * Invoked from signal delivery to fixup based on the register context before
+ * switching to the signal delivery context.
+ */
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
- __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /* '&' is intentional to spare one conditional branch */
+ if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
+ __rseq_signal_deliver(ksig->sig, regs);
+ } else {
+ if (current->rseq.event.has_rseq)
+ __rseq_signal_deliver(ksig->sig, regs);
+ }
}
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
+static inline void rseq_raise_notify_resume(struct task_struct *t)
{
- __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
+ set_tsk_thread_flag(t, TIF_RSEQ);
}
-/*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */
-static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
+/* Invoked from context switch to force evaluation on exit to user */
+static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
- if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
+ struct rseq_event *ev = &t->rseq.event;
+
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /*
+ * Avoid a boat load of conditionals by using simple logic
+ * to determine whether NOTIFY_RESUME needs to be raised.
+ *
+ * It's required when the CPU or MM CID has changed or
+ * the entry was from user space.
+ */
+ bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
+
+ if (raise) {
+ ev->sched_switch = true;
+ rseq_raise_notify_resume(t);
+ }
} else {
- t->rseq = current->rseq;
- t->rseq_len = current->rseq_len;
- t->rseq_sig = current->rseq_sig;
- t->rseq_event_mask = current->rseq_event_mask;
+ if (ev->has_rseq) {
+ t->rseq.event.sched_switch = true;
+ rseq_raise_notify_resume(t);
+ }
}
}
-static inline void rseq_execve(struct task_struct *t)
+/*
+ * Invoked from __set_task_cpu() when a task migrates or from
+ * mm_cid_schedin() when the CID changes to enforce an IDs update.
+ *
+ * This does not raise TIF_NOTIFY_RESUME as that happens in
+ * rseq_sched_switch_event().
+ */
+static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
+ t->rseq.event.ids_changed = true;
}
-#else
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
+/* Enforce a full update after RSEQ registration and when execve() failed */
+static inline void rseq_force_update(void)
{
+ if (current->rseq.event.has_rseq) {
+ current->rseq.event.ids_changed = true;
+ current->rseq.event.sched_switch = true;
+ rseq_raise_notify_resume(current);
+ }
}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
+
+/*
+ * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
+ * which clears TIF_NOTIFY_RESUME on architectures that don't use the
+ * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
+ *
+ * To avoid updating user space RSEQ in that case just to do it eventually
+ * again before returning to user space, because __rseq_handle_slowpath()
+ * does nothing when invoked with NULL register state.
+ *
+ * After returning from guest mode, before exiting to userspace, hypervisors
+ * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
+ */
+static inline void rseq_virt_userspace_exit(void)
{
+ /*
+ * The generic optimization for deferring RSEQ updates until the next
+ * exit relies on having a dedicated TIF_RSEQ.
+ */
+ if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
+ current->rseq.event.sched_switch)
+ rseq_raise_notify_resume(current);
}
-static inline void rseq_preempt(struct task_struct *t)
+
+static inline void rseq_reset(struct task_struct *t)
{
+ memset(&t->rseq, 0, sizeof(t->rseq));
+ t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}
-static inline void rseq_migrate(struct task_struct *t)
+
+static inline void rseq_execve(struct task_struct *t)
{
+ rseq_reset(t);
}
+
+/*
+ * If parent process has a registered restartable sequences area, the
+ * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ *
+ * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
+ * on the COW page on exit to user space, when the child stays on the same
+ * CPU as the parent. That's obviously not guaranteed, but in overcommit
+ * scenarios it is more likely and optimizes for the fork/exec case without
+ * taking the fault.
+ */
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
-}
-static inline void rseq_execve(struct task_struct *t)
-{
+ if (clone_flags & CLONE_VM)
+ rseq_reset(t);
+ else
+ t->rseq = current->rseq;
}
-#endif
+#else /* CONFIG_RSEQ */
+static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_sched_switch_event(struct task_struct *t) { }
+static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
+static inline void rseq_force_update(void) { }
+static inline void rseq_virt_userspace_exit(void) { }
+static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
+static inline void rseq_execve(struct task_struct *t) { }
+#endif /* !CONFIG_RSEQ */
#ifdef CONFIG_DEBUG_RSEQ
-
void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
-{
-}
-
-#endif
+#else /* CONFIG_DEBUG_RSEQ */
+static inline void rseq_syscall(struct pt_regs *regs) { }
+#endif /* !CONFIG_DEBUG_RSEQ */
#endif /* _LINUX_RSEQ_H */
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
new file mode 100644
index 000000000000..c92167ff8a7f
--- /dev/null
+++ b/include/linux/rseq_entry.h
@@ -0,0 +1,616 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_ENTRY_H
+#define _LINUX_RSEQ_ENTRY_H
+
+/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
+#ifdef CONFIG_RSEQ_STATS
+#include <linux/percpu.h>
+
+struct rseq_stats {
+ unsigned long exit;
+ unsigned long signal;
+ unsigned long slowpath;
+ unsigned long fastpath;
+ unsigned long ids;
+ unsigned long cs;
+ unsigned long clear;
+ unsigned long fixup;
+};
+
+DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
+
+/*
+ * Slow path has interrupts and preemption enabled, but the fast path
+ * runs with interrupts disabled so there is no point in having the
+ * preemption checks implied in __this_cpu_inc() for every operation.
+ */
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_stat_inc(which) this_cpu_inc((which))
+#else
+#define rseq_stat_inc(which) raw_cpu_inc((which))
+#endif
+
+#else /* CONFIG_RSEQ_STATS */
+#define rseq_stat_inc(x) do { } while (0)
+#endif /* !CONFIG_RSEQ_STATS */
+
+#ifdef CONFIG_RSEQ
+#include <linux/jump_label.h>
+#include <linux/rseq.h>
+#include <linux/uaccess.h>
+
+#include <linux/tracepoint-defs.h>
+
+#ifdef CONFIG_TRACEPOINTS
+DECLARE_TRACEPOINT(rseq_update);
+DECLARE_TRACEPOINT(rseq_ip_fixup);
+void __rseq_trace_update(struct task_struct *t);
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip);
+
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
+{
+ if (tracepoint_enabled(rseq_update) && ids)
+ __rseq_trace_update(t);
+}
+
+static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip)
+{
+ if (tracepoint_enabled(rseq_ip_fixup))
+ __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+}
+
+#else /* CONFIG_TRACEPOINT */
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
+static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip) { }
+#endif /* !CONFIG_TRACEPOINT */
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_inline
+#else
+#define rseq_inline __always_inline
+#endif
+
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+bool rseq_debug_validate_ids(struct task_struct *t);
+
+static __always_inline void rseq_note_user_irq_entry(void)
+{
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
+ current->rseq.event.user_irq = true;
+}
+
+/*
+ * Check whether there is a valid critical section and whether the
+ * instruction pointer in @regs is inside the critical section.
+ *
+ * - If the critical section is invalid, terminate the task.
+ *
+ * - If valid and the instruction pointer is inside, set it to the abort IP.
+ *
+ * - If valid and the instruction pointer is outside, clear the critical
+ * section address.
+ *
+ * Returns true, if the section was valid and either fixup or clear was
+ * done, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when a invalid
+ * section was found. It's clear when the failure was an unresolved page
+ * fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+
+#ifdef RSEQ_BUILD_SLOW_PATH
+/*
+ * The debug version is put out of line, but kept here so the code stays
+ * together.
+ *
+ * @csaddr has already been checked by the caller to be in user space
+ */
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
+ unsigned long csaddr)
+{
+ struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+ u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
+ unsigned long ip = instruction_pointer(regs);
+ u64 __user *uc_head = (u64 __user *) ucs;
+ u32 usig, __user *uc_sig;
+
+ scoped_user_rw_access(ucs, efault) {
+ /*
+ * Evaluate the user pile and exit if one of the conditions
+ * is not fulfilled.
+ */
+ unsafe_get_user(start_ip, &ucs->start_ip, efault);
+ if (unlikely(start_ip >= tasksize))
+ goto die;
+ /* If outside, just clear the critical section. */
+ if (ip < start_ip)
+ goto clear;
+
+ unsafe_get_user(offset, &ucs->post_commit_offset, efault);
+ cs_end = start_ip + offset;
+ /* Check for overflow and wraparound */
+ if (unlikely(cs_end >= tasksize || cs_end < start_ip))
+ goto die;
+
+ /* If not inside, clear it. */
+ if (ip >= cs_end)
+ goto clear;
+
+ unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
+ /* Ensure it's "valid" */
+ if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
+ goto die;
+ /* Validate that the abort IP is not in the critical section */
+ if (unlikely(abort_ip - start_ip < offset))
+ goto die;
+
+ /*
+ * Check version and flags for 0. No point in emitting
+ * deprecated warnings before dying. That could be done in
+ * the slow path eventually, but *shrug*.
+ */
+ unsafe_get_user(head, uc_head, efault);
+ if (unlikely(head))
+ goto die;
+
+ /* abort_ip - 4 is >= 0. See abort_ip check above */
+ uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+ unsafe_get_user(usig, uc_sig, efault);
+ if (unlikely(usig != t->rseq.sig))
+ goto die;
+
+ /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /* If not in interrupt from user context, let it die */
+ if (unlikely(!t->rseq.event.user_irq))
+ goto die;
+ }
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ instruction_pointer_set(regs, (unsigned long)abort_ip);
+ rseq_stat_inc(rseq_stats.fixup);
+ break;
+ clear:
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ rseq_stat_inc(rseq_stats.clear);
+ abort_ip = 0ULL;
+ }
+
+ if (unlikely(abort_ip))
+ rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
+
+/*
+ * On debug kernels validate that user space did not mess with it if the
+ * debug branch is enabled.
+ */
+bool rseq_debug_validate_ids(struct task_struct *t)
+{
+ struct rseq __user *rseq = t->rseq.usrptr;
+ u32 cpu_id, uval, node_id;
+
+ /*
+ * On the first exit after registering the rseq region CPU ID is
+ * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
+ */
+ node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
+ cpu_to_node(t->rseq.ids.cpu_id) : 0;
+
+ scoped_user_read_access(rseq, efault) {
+ unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
+ if (cpu_id != t->rseq.ids.cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->cpu_id, efault);
+ if (uval != cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->node_id, efault);
+ if (uval != node_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->mm_cid, efault);
+ if (uval != t->rseq.ids.mm_cid)
+ goto die;
+ }
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
+
+#endif /* RSEQ_BUILD_SLOW_PATH */
+
+/*
+ * This only ensures that abort_ip is in the user address space and
+ * validates that it is preceded by the signature.
+ *
+ * No other sanity checks are done here, that's what the debug code is for.
+ */
+static rseq_inline bool
+rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
+{
+ struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+ unsigned long ip = instruction_pointer(regs);
+ unsigned long tasksize = TASK_SIZE;
+ u64 start_ip, abort_ip, offset;
+ u32 usig, __user *uc_sig;
+
+ rseq_stat_inc(rseq_stats.cs);
+
+ if (unlikely(csaddr >= tasksize)) {
+ t->rseq.event.fatal = true;
+ return false;
+ }
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ return rseq_debug_update_user_cs(t, regs, csaddr);
+
+ scoped_user_rw_access(ucs, efault) {
+ unsafe_get_user(start_ip, &ucs->start_ip, efault);
+ unsafe_get_user(offset, &ucs->post_commit_offset, efault);
+ unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
+
+ /*
+ * No sanity checks. If user space screwed it up, it can
+ * keep the pieces. That's what debug code is for.
+ *
+ * If outside, just clear the critical section.
+ */
+ if (ip - start_ip >= offset)
+ goto clear;
+
+ /*
+ * Two requirements for @abort_ip:
+ * - Must be in user space as x86 IRET would happily return to
+ * the kernel.
+ * - The four bytes preceding the instruction at @abort_ip must
+ * contain the signature.
+ *
+ * The latter protects against the following attack vector:
+ *
+ * An attacker with limited abilities to write, creates a critical
+ * section descriptor, sets the abort IP to a library function or
+ * some other ROP gadget and stores the address of the descriptor
+ * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
+ * protection.
+ */
+ if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
+ goto die;
+
+ /* The address is guaranteed to be >= 0 and < TASK_SIZE */
+ uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+ unsafe_get_user(usig, uc_sig, efault);
+ if (unlikely(usig != t->rseq.sig))
+ goto die;
+
+ /* Invalidate the critical section */
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ /* Update the instruction pointer */
+ instruction_pointer_set(regs, (unsigned long)abort_ip);
+ rseq_stat_inc(rseq_stats.fixup);
+ break;
+ clear:
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ rseq_stat_inc(rseq_stats.clear);
+ abort_ip = 0ULL;
+ }
+
+ if (unlikely(abort_ip))
+ rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
+
+/*
+ * Updates CPU ID, Node ID and MM CID and reads the critical section
+ * address, when @csaddr != NULL. This allows to put the ID update and the
+ * read under the same uaccess region to spare a separate begin/end.
+ *
+ * As this is either invoked from a C wrapper with @csaddr = NULL or from
+ * the fast path code with a valid pointer, a clever compiler should be
+ * able to optimize the read out. Spares a duplicate implementation.
+ *
+ * Returns true, if the operation was successful, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when invalid data
+ * was found on debug kernels. It's clear when the failure was an unresolved page
+ * fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+static rseq_inline
+bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
+ u32 node_id, u64 *csaddr)
+{
+ struct rseq __user *rseq = t->rseq.usrptr;
+
+ if (static_branch_unlikely(&rseq_debug_enabled)) {
+ if (!rseq_debug_validate_ids(t))
+ return false;
+ }
+
+ scoped_user_rw_access(rseq, efault) {
+ unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
+ unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
+ unsafe_put_user(node_id, &rseq->node_id, efault);
+ unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
+ if (csaddr)
+ unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+ }
+
+ /* Cache the new values */
+ t->rseq.ids.cpu_cid = ids->cpu_cid;
+ rseq_stat_inc(rseq_stats.ids);
+ rseq_trace_update(t, ids);
+ return true;
+efault:
+ return false;
+}
+
+/*
+ * Update user space with new IDs and conditionally check whether the task
+ * is in a critical section.
+ */
+static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
+ struct rseq_ids *ids, u32 node_id)
+{
+ u64 csaddr;
+
+ if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
+ return false;
+
+ /*
+ * On architectures which utilize the generic entry code this
+ * allows to skip the critical section when the entry was not from
+ * a user space interrupt, unless debug mode is enabled.
+ */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ if (!static_branch_unlikely(&rseq_debug_enabled)) {
+ if (likely(!t->rseq.event.user_irq))
+ return true;
+ }
+ }
+ if (likely(!csaddr))
+ return true;
+ /* Sigh, this really needs to do work */
+ return rseq_update_user_cs(t, regs, csaddr);
+}
+
+/*
+ * If you want to use this then convert your architecture to the generic
+ * entry code. I'm tired of building workarounds for people who can't be
+ * bothered to make the maintenance of generic infrastructure less
+ * burdensome. Just sucking everything into the architecture code and
+ * thereby making others chase the horrible hacks and keep them working is
+ * neither acceptable nor sustainable.
+ */
+#ifdef CONFIG_GENERIC_ENTRY
+
+/*
+ * This is inlined into the exit path because:
+ *
+ * 1) It's a one time comparison in the fast path when there is no event to
+ * handle
+ *
+ * 2) The access to the user space rseq memory (TLS) is unlikely to fault
+ * so the straight inline operation is:
+ *
+ * - Four 32-bit stores only if CPU ID/ MM CID need to be updated
+ * - One 64-bit load to retrieve the critical section address
+ *
+ * 3) In the unlikely case that the critical section address is != NULL:
+ *
+ * - One 64-bit load to retrieve the start IP
+ * - One 64-bit load to retrieve the offset for calculating the end
+ * - One 64-bit load to retrieve the abort IP
+ * - One 64-bit load to retrieve the signature
+ * - One store to clear the critical section address
+ *
+ * The non-debug case implements only the minimal required checking. It
+ * provides protection against a rogue abort IP in kernel space, which
+ * would be exploitable at least on x86, and also against a rogue CS
+ * descriptor by checking the signature at the abort IP. Any fallout from
+ * invalid critical section descriptors is a user space problem. The debug
+ * case provides the full set of checks and terminates the task if a
+ * condition is not met.
+ *
+ * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
+ * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
+ * slow path there will handle the failure.
+ */
+static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
+{
+ /*
+ * Page faults need to be disabled as this is called with
+ * interrupts disabled
+ */
+ guard(pagefault)();
+ if (likely(!t->rseq.event.ids_changed)) {
+ struct rseq __user *rseq = t->rseq.usrptr;
+ /*
+ * If IDs have not changed rseq_event::user_irq must be true
+ * See rseq_sched_switch_event().
+ */
+ u64 csaddr;
+
+ if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
+ return false;
+
+ if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
+ if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
+ return false;
+ }
+ return true;
+ }
+
+ struct rseq_ids ids = {
+ .cpu_id = task_cpu(t),
+ .mm_cid = task_mm_cid(t),
+ };
+ u32 node_id = cpu_to_node(ids.cpu_id);
+
+ return rseq_update_usr(t, regs, &ids, node_id);
+}
+
+static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+{
+ struct task_struct *t = current;
+
+ /*
+ * If the task did not go through schedule or got the flag enforced
+ * by the rseq syscall or execve, then nothing to do here.
+ *
+ * CPU ID and MM CID can only change when going through a context
+ * switch.
+ *
+ * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
+ * only when rseq_event::has_rseq is true. That conditional is
+ * required to avoid setting the TIF bit if RSEQ is not registered
+ * for a task. rseq_event::sched_switch is cleared when RSEQ is
+ * unregistered by a task so it's sufficient to check for the
+ * sched_switch bit alone.
+ *
+ * A sane compiler requires three instructions for the nothing to do
+ * case including clearing the events, but your mileage might vary.
+ */
+ if (unlikely((t->rseq.event.sched_switch))) {
+ rseq_stat_inc(rseq_stats.fastpath);
+
+ if (unlikely(!rseq_exit_user_update(regs, t)))
+ return true;
+ }
+ /* Clear state so next entry starts from a clean slate */
+ t->rseq.event.events = 0;
+ return false;
+}
+
+/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+static __always_inline bool test_tif_rseq(unsigned long ti_work)
+{
+ return ti_work & _TIF_RSEQ;
+}
+
+static __always_inline void clear_tif_rseq(void)
+{
+ static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
+ clear_thread_flag(TIF_RSEQ);
+}
+#else
+static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
+static __always_inline void clear_tif_rseq(void) { }
+#endif
+
+static __always_inline bool
+rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+ if (likely(!test_tif_rseq(ti_work)))
+ return false;
+
+ if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+ current->rseq.event.slowpath = true;
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ return true;
+ }
+
+ clear_tif_rseq();
+ return false;
+}
+
+#else /* CONFIG_GENERIC_ENTRY */
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+ return false;
+}
+#endif /* !CONFIG_GENERIC_ENTRY */
+
+static __always_inline void rseq_syscall_exit_to_user_mode(void)
+{
+ struct rseq_event *ev = &current->rseq.event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ /* Needed to remove the store for the !lockdep case */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ WARN_ON_ONCE(ev->sched_switch);
+ ev->events = 0;
+ }
+}
+
+static __always_inline void rseq_irqentry_exit_to_user_mode(void)
+{
+ struct rseq_event *ev = &current->rseq.event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ lockdep_assert_once(!ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing could not clear it.
+ */
+ ev->events = 0;
+}
+
+/* Required to keep ARM64 working */
+static __always_inline void rseq_exit_to_user_mode_legacy(void)
+{
+ struct rseq_event *ev = &current->rseq.event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
+}
+
+void __rseq_debug_syscall_return(struct pt_regs *regs);
+
+static inline void rseq_debug_syscall_return(struct pt_regs *regs)
+{
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ __rseq_debug_syscall_return(regs);
+}
+#else /* CONFIG_RSEQ */
+static inline void rseq_note_user_irq_entry(void) { }
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+ return false;
+}
+static inline void rseq_syscall_exit_to_user_mode(void) { }
+static inline void rseq_irqentry_exit_to_user_mode(void) { }
+static inline void rseq_exit_to_user_mode_legacy(void) { }
+static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
+#endif /* !CONFIG_RSEQ */
+
+#endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
new file mode 100644
index 000000000000..332dc14b81c9
--- /dev/null
+++ b/include/linux/rseq_types.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_TYPES_H
+#define _LINUX_RSEQ_TYPES_H
+
+#include <linux/irq_work_types.h>
+#include <linux/types.h>
+#include <linux/workqueue_types.h>
+
+#ifdef CONFIG_RSEQ
+struct rseq;
+
+/**
+ * struct rseq_event - Storage for rseq related event management
+ * @all: Compound to initialize and clear the data efficiently
+ * @events: Compound to access events with a single load/store
+ * @sched_switch: True if the task was scheduled and needs update on
+ * exit to user
+ * @ids_changed: Indicator that IDs need to be updated
+ * @user_irq: True on interrupt entry from user mode
+ * @has_rseq: True if the task has a rseq pointer installed
+ * @error: Compound error code for the slow path to analyze
+ * @fatal: User space data corrupted or invalid
+ * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME
+ * is required
+ *
+ * @sched_switch and @ids_changed must be adjacent and the combo must be
+ * 16bit aligned to allow a single store, when both are set at the same
+ * time in the scheduler.
+ */
+struct rseq_event {
+ union {
+ u64 all;
+ struct {
+ union {
+ u32 events;
+ struct {
+ u8 sched_switch;
+ u8 ids_changed;
+ u8 user_irq;
+ };
+ };
+
+ u8 has_rseq;
+ u8 __pad;
+ union {
+ u16 error;
+ struct {
+ u8 fatal;
+ u8 slowpath;
+ };
+ };
+ };
+ };
+};
+
+/**
+ * struct rseq_ids - Cache for ids, which need to be updated
+ * @cpu_cid: Compound of @cpu_id and @mm_cid to make the
+ * compiler emit a single compare on 64-bit
+ * @cpu_id: The CPU ID which was written last to user space
+ * @mm_cid: The MM CID which was written last to user space
+ *
+ * @cpu_id and @mm_cid are updated when the data is written to user space.
+ */
+struct rseq_ids {
+ union {
+ u64 cpu_cid;
+ struct {
+ u32 cpu_id;
+ u32 mm_cid;
+ };
+ };
+};
+
+/**
+ * struct rseq_data - Storage for all rseq related data
+ * @usrptr: Pointer to the registered user space RSEQ memory
+ * @len: Length of the RSEQ region
+ * @sig: Signature of critial section abort IPs
+ * @event: Storage for event management
+ * @ids: Storage for cached CPU ID and MM CID
+ */
+struct rseq_data {
+ struct rseq __user *usrptr;
+ u32 len;
+ u32 sig;
+ struct rseq_event event;
+ struct rseq_ids ids;
+};
+
+#else /* CONFIG_RSEQ */
+struct rseq_data { };
+#endif /* !CONFIG_RSEQ */
+
+#ifdef CONFIG_SCHED_MM_CID
+
+#define MM_CID_UNSET BIT(31)
+#define MM_CID_ONCPU BIT(30)
+#define MM_CID_TRANSIT BIT(29)
+
+/**
+ * struct sched_mm_cid - Storage for per task MM CID data
+ * @active: MM CID is active for the task
+ * @cid: The CID associated to the task either permanently or
+ * borrowed from the CPU
+ */
+struct sched_mm_cid {
+ unsigned int active;
+ unsigned int cid;
+};
+
+/**
+ * struct mm_cid_pcpu - Storage for per CPU MM_CID data
+ * @cid: The CID associated to the CPU either permanently or
+ * while a task with a CID is running
+ */
+struct mm_cid_pcpu {
+ unsigned int cid;
+}____cacheline_aligned_in_smp;
+
+/**
+ * struct mm_mm_cid - Storage for per MM CID data
+ * @pcpu: Per CPU storage for CIDs associated to a CPU
+ * @percpu: Set, when CIDs are in per CPU mode
+ * @transit: Set to MM_CID_TRANSIT during a mode change transition phase
+ * @max_cids: The exclusive maximum CID value for allocation and convergence
+ * @irq_work: irq_work to handle the affinity mode change case
+ * @work: Regular work to handle the affinity mode change case
+ * @lock: Spinlock to protect against affinity setting which can't take @mutex
+ * @mutex: Mutex to serialize forks and exits related to this mm
+ * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
+ * is growth only.
+ * @users: The number of tasks sharing this MM. Separate from mm::mm_users
+ * as that is modified by mmget()/mm_put() by other entities which
+ * do not actually share the MM.
+ * @pcpu_thrs: Threshold for switching back from per CPU mode
+ * @update_deferred: A deferred switch back to per task mode is pending.
+ */
+struct mm_mm_cid {
+ /* Hotpath read mostly members */
+ struct mm_cid_pcpu __percpu *pcpu;
+ unsigned int percpu;
+ unsigned int transit;
+ unsigned int max_cids;
+
+ /* Rarely used. Moves @lock and @mutex into the second cacheline */
+ struct irq_work irq_work;
+ struct work_struct work;
+
+ raw_spinlock_t lock;
+ struct mutex mutex;
+
+ /* Low frequency modified */
+ unsigned int nr_cpus_allowed;
+ unsigned int users;
+ unsigned int pcpu_thrs;
+ unsigned int update_deferred;
+}____cacheline_aligned_in_smp;
+#else /* CONFIG_SCHED_MM_CID */
+struct mm_mm_cid { };
+struct sched_mm_cid { };
+#endif /* !CONFIG_SCHED_MM_CID */
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866..fac12bb7dbe4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -41,7 +41,7 @@
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
-#include <uapi/linux/rseq.h>
+#include <linux/rseq_types.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
@@ -637,8 +637,8 @@ struct sched_rt_entity {
#endif
} __randomize_layout;
-typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
-typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+struct rq_flags;
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf);
struct sched_dl_entity {
struct rb_node rb_node;
@@ -685,20 +685,22 @@ struct sched_dl_entity {
*
* @dl_server tells if this is a server entity.
*
- * @dl_defer tells if this is a deferred or regular server. For
- * now only defer server exists.
- *
- * @dl_defer_armed tells if the deferrable server is waiting
- * for the replenishment timer to activate it.
- *
* @dl_server_active tells if the dlserver is active(started).
* dlserver is started on first cfs enqueue on an idle runqueue
* and is stopped when a dequeue results in 0 cfs tasks on the
* runqueue. In other words, dlserver is active only when cpu's
* runqueue has atleast one cfs task.
*
+ * @dl_defer tells if this is a deferred or regular server. For
+ * now only defer server exists.
+ *
+ * @dl_defer_armed tells if the deferrable server is waiting
+ * for the replenishment timer to activate it.
+ *
* @dl_defer_running tells if the deferrable server is actually
* running, skipping the defer phase.
+ *
+ * @dl_defer_idle tracks idle state
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
@@ -709,6 +711,7 @@ struct sched_dl_entity {
unsigned int dl_defer : 1;
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
+ unsigned int dl_defer_idle : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
@@ -730,9 +733,6 @@ struct sched_dl_entity {
* dl_server_update().
*
* @rq the runqueue this server is for
- *
- * @server_has_tasks() returns true if @server_pick return a
- * runnable task.
*/
struct rq *rq;
dl_server_pick_f server_pick_task;
@@ -1406,33 +1406,8 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_RSEQ
- struct rseq __user *rseq;
- u32 rseq_len;
- u32 rseq_sig;
- /*
- * RmW on rseq_event_mask must be performed atomically
- * with respect to preemption.
- */
- unsigned long rseq_event_mask;
-# ifdef CONFIG_DEBUG_RSEQ
- /*
- * This is a place holder to save a copy of the rseq fields for
- * validation of read-only fields. The struct rseq has a
- * variable-length array at the end, so it cannot be used
- * directly. Reserve a size large enough for the known fields.
- */
- char rseq_fields[sizeof(struct rseq)];
-# endif
-#endif
-
-#ifdef CONFIG_SCHED_MM_CID
- int mm_cid; /* Current cid in mm */
- int last_mm_cid; /* Most recent cid in mm */
- int migrate_from_cpu;
- int mm_cid_active; /* Whether cid bitmap is active */
- struct callback_head cid_work;
-#endif
+ struct rseq_data rseq;
+ struct sched_mm_cid mm_cid;
struct tlbflush_unmap_batch tlb_ubc;
@@ -1861,8 +1836,8 @@ extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
-/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
-extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
+extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);
/**
* set_cpus_allowed_ptr - set CPU affinity mask of a task
@@ -1901,6 +1876,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
+extern void sched_set_fifo_secondary(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
@@ -2058,6 +2034,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
+static inline void set_need_resched_current(void)
+{
+ lockdep_assert_irqs_disabled();
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
@@ -2318,6 +2301,32 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+/* Avoids recursive inclusion hell */
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit(struct task_struct *t);
+static __always_inline int task_mm_cid(struct task_struct *t)
+{
+ return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT);
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit(struct task_struct *t) { }
+static __always_inline int task_mm_cid(struct task_struct *t)
+{
+ /*
+ * Use the processor id as a fall-back when the mm cid feature is
+ * disabled. This provides functional per-cpu data structure accesses
+ * in user-space, althrough it won't provide the memory usage benefits.
+ */
+ return task_cpu(t);
+}
+#endif
+
#ifndef MODULE
#ifndef COMPILE_OFFSETS
@@ -2407,12 +2416,12 @@ static inline void __migrate_enable(void) { }
* be defined in kernel/sched/core.c.
*/
#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE
-static inline void migrate_disable(void)
+static __always_inline void migrate_disable(void)
{
__migrate_disable();
}
-static inline void migrate_enable(void)
+static __always_inline void migrate_enable(void)
{
__migrate_enable();
}
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index b7fafe999073..624fda17a785 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,7 +8,7 @@
#define SUID_DUMP_USER 1 /* Dump as user of process */
#define SUID_DUMP_ROOT 2 /* Dump as root */
-static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm)
+static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm)
{
/*
* By convention, dumpable bits are contained in first 32 bits of the
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf12aa6e..45c0022b91ce 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
unsigned int nr_balance_failed; /* initialise to 0 */
/* idle_balance() stats */
+ unsigned int newidle_call;
+ unsigned int newidle_success;
+ unsigned int newidle_ratio;
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5ce48eab7a2a..a8a8661839b6 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -1209,4 +1209,118 @@ done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
if (seq & 1)
read_sequnlock_excl_irqrestore(lock, flags);
}
+
+enum ss_state {
+ ss_done = 0,
+ ss_lock,
+ ss_lock_irqsave,
+ ss_lockless,
+};
+
+struct ss_tmp {
+ enum ss_state state;
+ unsigned long data;
+ spinlock_t *lock;
+ spinlock_t *lock_irqsave;
+};
+
+static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst)
+{
+ if (sst->lock)
+ spin_unlock(sst->lock);
+ if (sst->lock_irqsave)
+ spin_unlock_irqrestore(sst->lock_irqsave, sst->data);
+}
+
+extern void __scoped_seqlock_invalid_target(void);
+
+#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN)
+/*
+ * For some reason some GCC-8 architectures (nios2, alpha) have trouble
+ * determining that the ss_done state is impossible in __scoped_seqlock_next()
+ * below.
+ *
+ * Similarly KASAN is known to confuse compilers enough to break this. But we
+ * don't care about code quality for KASAN builds anyway.
+ */
+static inline void __scoped_seqlock_bug(void) { }
+#else
+/*
+ * Canary for compiler optimization -- if the compiler doesn't realize this is
+ * an impossible state, it very likely generates sub-optimal code here.
+ */
+extern void __scoped_seqlock_bug(void);
+#endif
+
+static inline void
+__scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target)
+{
+ switch (sst->state) {
+ case ss_done:
+ __scoped_seqlock_bug();
+ return;
+
+ case ss_lock:
+ case ss_lock_irqsave:
+ sst->state = ss_done;
+ return;
+
+ case ss_lockless:
+ if (!read_seqretry(lock, sst->data)) {
+ sst->state = ss_done;
+ return;
+ }
+ break;
+ }
+
+ switch (target) {
+ case ss_done:
+ __scoped_seqlock_invalid_target();
+ return;
+
+ case ss_lock:
+ sst->lock = &lock->lock;
+ spin_lock(sst->lock);
+ sst->state = ss_lock;
+ return;
+
+ case ss_lock_irqsave:
+ sst->lock_irqsave = &lock->lock;
+ spin_lock_irqsave(sst->lock_irqsave, sst->data);
+ sst->state = ss_lock_irqsave;
+ return;
+
+ case ss_lockless:
+ sst->data = read_seqbegin(lock);
+ return;
+ }
+}
+
+#define __scoped_seqlock_read(_seqlock, _target, _s) \
+ for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) = \
+ { .state = ss_lockless, .data = read_seqbegin(_seqlock) }; \
+ _s.state != ss_done; \
+ __scoped_seqlock_next(&_s, _seqlock, _target))
+
+/**
+ * scoped_seqlock_read (lock, ss_state) - execute the read side critical
+ * section without manual sequence
+ * counter handling or calls to other
+ * helpers
+ * @lock: pointer to seqlock_t protecting the data
+ * @ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} indicating
+ * the type of critical read section
+ *
+ * Example:
+ *
+ * scoped_seqlock_read (&lock, ss_lock) {
+ * // read-side critical section
+ * }
+ *
+ * Starts with a lockess pass first. If it fails, restarts the critical
+ * section with the lock held.
+ */
+#define scoped_seqlock_read(_seqlock, _target) \
+ __scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock))
+
#endif /* __LINUX_SEQLOCK_H */
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 0e47465ef0fd..774efe592a9a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -111,7 +111,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
struct list_head *folio_list);
-void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
+void shmem_truncate_range(struct inode *inode, loff_t start, uoff_t end);
int shmem_unuse(unsigned int type);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 66c06fcdfe19..cf84d98964b2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -77,6 +77,7 @@ struct cachestat_range;
struct cachestat;
struct statmount;
struct mnt_id_req;
+struct ns_id_req;
struct xattr_args;
struct file_attr;
@@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
u64 __user *mnt_ids, size_t nr_mnt_ids,
unsigned int flags);
+asmlinkage long sys_listns(const struct ns_id_req __user *req,
+ u64 __user *ns_ids, size_t nr_ns_ids,
+ unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
asmlinkage long sys_ftruncate(unsigned int fd, off_t length);
#if BITS_PER_LONG == 32
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index dd925d84fa46..b40de9bab4b7 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -67,6 +67,11 @@ enum syscall_work_bit {
#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
#endif
+#ifndef TIF_RSEQ
+# define TIF_RSEQ TIF_NOTIFY_RESUME
+# define _TIF_RSEQ _TIF_NOTIFY_RESUME
+#endif
+
#ifdef __KERNEL__
#ifndef arch_set_restart_data
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 0414d9e6b4fc..62e1cea71125 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -188,4 +188,13 @@ int timers_dead_cpu(unsigned int cpu);
#define timers_dead_cpu NULL
#endif
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask);
+#else
+static inline int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
+{
+ return 0;
+}
+#endif
+
#endif
diff --git a/include/linux/types.h b/include/linux/types.h
index 6dfdb8e8e4c3..d4437e9c452c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -50,6 +50,7 @@ typedef __kernel_old_gid_t old_gid_t;
#if defined(__GNUC__)
typedef __kernel_loff_t loff_t;
+typedef __kernel_uoff_t uoff_t;
#endif
/*
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 1beb5b395d81..be395f5f7ee3 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -2,6 +2,7 @@
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__
+#include <linux/cleanup.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/minmax.h>
@@ -35,9 +36,17 @@
#ifdef masked_user_access_begin
#define can_do_masked_user_access() 1
+# ifndef masked_user_write_access_begin
+# define masked_user_write_access_begin masked_user_access_begin
+# endif
+# ifndef masked_user_read_access_begin
+# define masked_user_read_access_begin masked_user_access_begin
+#endif
#else
#define can_do_masked_user_access() 0
#define masked_user_access_begin(src) NULL
+ #define masked_user_read_access_begin(src) NULL
+ #define masked_user_write_access_begin(src) NULL
#define mask_user_address(src) (src)
#endif
@@ -518,7 +527,34 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count);
long strnlen_user_nofault(const void __user *unsafe_addr, long count);
-#ifndef __get_kernel_nofault
+#ifdef arch_get_kernel_nofault
+/*
+ * Wrap the architecture implementation so that @label can be outside of a
+ * cleanup() scope. A regular C goto works correctly, but ASM goto does
+ * not. Clang rejects such an attempt, but GCC silently emits buggy code.
+ */
+#define __get_kernel_nofault(dst, src, type, label) \
+do { \
+ __label__ local_label; \
+ arch_get_kernel_nofault(dst, src, type, local_label); \
+ if (0) { \
+ local_label: \
+ goto label; \
+ } \
+} while (0)
+
+#define __put_kernel_nofault(dst, src, type, label) \
+do { \
+ __label__ local_label; \
+ arch_put_kernel_nofault(dst, src, type, local_label); \
+ if (0) { \
+ local_label: \
+ goto label; \
+ } \
+} while (0)
+
+#elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */
+
#define __get_kernel_nofault(dst, src, type, label) \
do { \
type __user *p = (type __force __user *)(src); \
@@ -535,7 +571,8 @@ do { \
if (__put_user(data, p)) \
goto label; \
} while (0)
-#endif
+
+#endif /* !__get_kernel_nofault */
/**
* get_kernel_nofault(): safely attempt to read from a location
@@ -549,7 +586,42 @@ do { \
copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
})
-#ifndef user_access_begin
+#ifdef user_access_begin
+
+#ifdef arch_unsafe_get_user
+/*
+ * Wrap the architecture implementation so that @label can be outside of a
+ * cleanup() scope. A regular C goto works correctly, but ASM goto does
+ * not. Clang rejects such an attempt, but GCC silently emits buggy code.
+ *
+ * Some architectures use internal local labels already, but this extra
+ * indirection here is harmless because the compiler optimizes it out
+ * completely in any case. This construct just ensures that the ASM GOTO
+ * target is always in the local scope. The C goto 'label' works correctly
+ * when leaving a cleanup() scope.
+ */
+#define unsafe_get_user(x, ptr, label) \
+do { \
+ __label__ local_label; \
+ arch_unsafe_get_user(x, ptr, local_label); \
+ if (0) { \
+ local_label: \
+ goto label; \
+ } \
+} while (0)
+
+#define unsafe_put_user(x, ptr, label) \
+do { \
+ __label__ local_label; \
+ arch_unsafe_put_user(x, ptr, local_label); \
+ if (0) { \
+ local_label: \
+ goto label; \
+ } \
+} while (0)
+#endif /* arch_unsafe_get_user */
+
+#else /* user_access_begin */
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
@@ -559,7 +631,8 @@ do { \
#define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e)
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
-#endif
+#endif /* !user_access_begin */
+
#ifndef user_write_access_begin
#define user_write_access_begin user_access_begin
#define user_write_access_end user_access_end
@@ -569,6 +642,239 @@ static inline void user_access_restore(unsigned long flags) { }
#define user_read_access_end user_access_end
#endif
+/* Define RW variant so the below _mode macro expansion works */
+#define masked_user_rw_access_begin(u) masked_user_access_begin(u)
+#define user_rw_access_begin(u, s) user_access_begin(u, s)
+#define user_rw_access_end() user_access_end()
+
+/* Scoped user access */
+#define USER_ACCESS_GUARD(_mode) \
+static __always_inline void __user * \
+class_user_##_mode##_begin(void __user *ptr) \
+{ \
+ return ptr; \
+} \
+ \
+static __always_inline void \
+class_user_##_mode##_end(void __user *ptr) \
+{ \
+ user_##_mode##_access_end(); \
+} \
+ \
+DEFINE_CLASS(user_ ##_mode## _access, void __user *, \
+ class_user_##_mode##_end(_T), \
+ class_user_##_mode##_begin(ptr), void __user *ptr) \
+ \
+static __always_inline class_user_##_mode##_access_t \
+class_user_##_mode##_access_ptr(void __user *scope) \
+{ \
+ return scope; \
+}
+
+USER_ACCESS_GUARD(read)
+USER_ACCESS_GUARD(write)
+USER_ACCESS_GUARD(rw)
+#undef USER_ACCESS_GUARD
+
+/**
+ * __scoped_user_access_begin - Start a scoped user access
+ * @mode: The mode of the access class (read, write, rw)
+ * @uptr: The pointer to access user space memory
+ * @size: Size of the access
+ * @elbl: Error label to goto when the access region is rejected
+ *
+ * Internal helper for __scoped_user_access(). Don't use directly.
+ */
+#define __scoped_user_access_begin(mode, uptr, size, elbl) \
+({ \
+ typeof(uptr) __retptr; \
+ \
+ if (can_do_masked_user_access()) { \
+ __retptr = masked_user_##mode##_access_begin(uptr); \
+ } else { \
+ __retptr = uptr; \
+ if (!user_##mode##_access_begin(uptr, size)) \
+ goto elbl; \
+ } \
+ __retptr; \
+})
+
+/**
+ * __scoped_user_access - Open a scope for user access
+ * @mode: The mode of the access class (read, write, rw)
+ * @uptr: The pointer to access user space memory
+ * @size: Size of the access
+ * @elbl: Error label to goto when the access region is rejected. It
+ * must be placed outside the scope
+ *
+ * If the user access function inside the scope requires a fault label, it
+ * can use @elbl or a different label outside the scope, which requires
+ * that user access which is implemented with ASM GOTO has been properly
+ * wrapped. See unsafe_get_user() for reference.
+ *
+ * scoped_user_rw_access(ptr, efault) {
+ * unsafe_get_user(rval, &ptr->rval, efault);
+ * unsafe_put_user(wval, &ptr->wval, efault);
+ * }
+ * return 0;
+ * efault:
+ * return -EFAULT;
+ *
+ * The scope is internally implemented as a autoterminating nested for()
+ * loop, which can be left with 'return', 'break' and 'goto' at any
+ * point.
+ *
+ * When the scope is left user_##@_mode##_access_end() is automatically
+ * invoked.
+ *
+ * When the architecture supports masked user access and the access region
+ * which is determined by @uptr and @size is not a valid user space
+ * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user
+ * space address and does not terminate early. This optimizes for the good
+ * case and lets the performance uncritical bad case go through the fault.
+ *
+ * The eventual modification of the pointer is limited to the scope.
+ * Outside of the scope the original pointer value is unmodified, so that
+ * the original pointer value is available for diagnostic purposes in an
+ * out of scope fault path.
+ *
+ * Nesting scoped user access into a user access scope is invalid and fails
+ * the build. Nesting into other guards, e.g. pagefault is safe.
+ *
+ * The masked variant does not check the size of the access and relies on a
+ * mapping hole (e.g. guard page) to catch an out of range pointer, the
+ * first access to user memory inside the scope has to be within
+ * @uptr ... @uptr + PAGE_SIZE - 1
+ *
+ * Don't use directly. Use scoped_masked_user_$MODE_access() instead.
+ */
+#define __scoped_user_access(mode, uptr, size, elbl) \
+for (bool done = false; !done; done = true) \
+ for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \
+ !done; done = true) \
+ for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \
+ /* Force modified pointer usage within the scope */ \
+ for (const typeof(uptr) uptr = _tmpptr; !done; done = true)
+
+/**
+ * scoped_user_read_access_size - Start a scoped user read access with given size
+ * @usrc: Pointer to the user space address to read from
+ * @size: Size of the access starting from @usrc
+ * @elbl: Error label to goto when the access region is rejected
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_read_access_size(usrc, size, elbl) \
+ __scoped_user_access(read, usrc, size, elbl)
+
+/**
+ * scoped_user_read_access - Start a scoped user read access
+ * @usrc: Pointer to the user space address to read from
+ * @elbl: Error label to goto when the access region is rejected
+ *
+ * The size of the access starting from @usrc is determined via sizeof(*@usrc)).
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_read_access(usrc, elbl) \
+ scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl)
+
+/**
+ * scoped_user_write_access_size - Start a scoped user write access with given size
+ * @udst: Pointer to the user space address to write to
+ * @size: Size of the access starting from @udst
+ * @elbl: Error label to goto when the access region is rejected
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_write_access_size(udst, size, elbl) \
+ __scoped_user_access(write, udst, size, elbl)
+
+/**
+ * scoped_user_write_access - Start a scoped user write access
+ * @udst: Pointer to the user space address to write to
+ * @elbl: Error label to goto when the access region is rejected
+ *
+ * The size of the access starting from @udst is determined via sizeof(*@udst)).
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_write_access(udst, elbl) \
+ scoped_user_write_access_size(udst, sizeof(*(udst)), elbl)
+
+/**
+ * scoped_user_rw_access_size - Start a scoped user read/write access with given size
+ * @uptr Pointer to the user space address to read from and write to
+ * @size: Size of the access starting from @uptr
+ * @elbl: Error label to goto when the access region is rejected
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_rw_access_size(uptr, size, elbl) \
+ __scoped_user_access(rw, uptr, size, elbl)
+
+/**
+ * scoped_user_rw_access - Start a scoped user read/write access
+ * @uptr Pointer to the user space address to read from and write to
+ * @elbl: Error label to goto when the access region is rejected
+ *
+ * The size of the access starting from @uptr is determined via sizeof(*@uptr)).
+ *
+ * For further information see __scoped_user_access() above.
+ */
+#define scoped_user_rw_access(uptr, elbl) \
+ scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl)
+
+/**
+ * get_user_inline - Read user data inlined
+ * @val: The variable to store the value read from user memory
+ * @usrc: Pointer to the user space memory to read from
+ *
+ * Return: 0 if successful, -EFAULT when faulted
+ *
+ * Inlined variant of get_user(). Only use when there is a demonstrable
+ * performance reason.
+ */
+#define get_user_inline(val, usrc) \
+({ \
+ __label__ efault; \
+ typeof(usrc) _tmpsrc = usrc; \
+ int _ret = 0; \
+ \
+ scoped_user_read_access(_tmpsrc, efault) \
+ unsafe_get_user(val, _tmpsrc, efault); \
+ if (0) { \
+ efault: \
+ _ret = -EFAULT; \
+ } \
+ _ret; \
+})
+
+/**
+ * put_user_inline - Write to user memory inlined
+ * @val: The value to write
+ * @udst: Pointer to the user space memory to write to
+ *
+ * Return: 0 if successful, -EFAULT when faulted
+ *
+ * Inlined variant of put_user(). Only use when there is a demonstrable
+ * performance reason.
+ */
+#define put_user_inline(val, udst) \
+({ \
+ __label__ efault; \
+ typeof(udst) _tmpdst = udst; \
+ int _ret = 0; \
+ \
+ scoped_user_write_access(_tmpdst, efault) \
+ unsafe_put_user(val, _tmpdst, efault); \
+ if (0) { \
+ efault: \
+ _ret = -EFAULT; \
+ } \
+ _ret; \
+})
+
#ifdef CONFIG_HARDENED_USERCOPY
void __noreturn usercopy_abort(const char *name, const char *detail,
bool to_user, unsigned long offset,
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 26122d00708a..bc7ae7d21900 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -6,16 +6,6 @@
#include <linux/unwind_user.h>
#include <linux/unwind_deferred_types.h>
-struct unwind_work;
-
-typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie);
-
-struct unwind_work {
- struct list_head list;
- unwind_callback_t func;
- int bit;
-};
-
#ifdef CONFIG_UNWIND_USER
enum {
@@ -44,22 +34,22 @@ void unwind_deferred_task_exit(struct task_struct *task);
static __always_inline void unwind_reset_info(void)
{
struct unwind_task_info *info = &current->unwind_info;
- unsigned long bits;
+ unsigned long bits = atomic_long_read(&info->unwind_mask);
/* Was there any unwinding? */
- if (unlikely(info->unwind_mask)) {
- bits = info->unwind_mask;
- do {
- /* Is a task_work going to run again before going back */
- if (bits & UNWIND_PENDING)
- return;
- } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
- current->unwind_info.id.id = 0;
-
- if (unlikely(info->cache)) {
- info->cache->nr_entries = 0;
- info->cache->unwind_completed = 0;
- }
+ if (likely(!bits))
+ return;
+
+ do {
+ /* Is a task_work going to run again before going back */
+ if (bits & UNWIND_PENDING)
+ return;
+ } while (!atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL));
+ current->unwind_info.id.id = 0;
+
+ if (unlikely(info->cache)) {
+ info->cache->nr_entries = 0;
+ info->cache->unwind_completed = 0;
}
}
@@ -68,9 +58,17 @@ static __always_inline void unwind_reset_info(void)
static inline void unwind_task_init(struct task_struct *task) {}
static inline void unwind_task_free(struct task_struct *task) {}
-static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; }
-static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
-static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
+static inline int unwind_user_faultable(struct unwind_stacktrace *trace)
+{ return -ENOSYS; }
+
+static inline int
+unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{ return -ENOSYS; }
+
+static inline int
+unwind_deferred_request(struct unwind_work *work, u64 *timestamp)
+{ return -ENOSYS; }
+
static inline void unwind_deferred_cancel(struct unwind_work *work) {}
static inline void unwind_deferred_task_exit(struct task_struct *task) {}
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 33b62ac25c86..18fa3932f61c 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -2,6 +2,9 @@
#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+#include <linux/types.h>
+#include <linux/atomic.h>
+
struct unwind_cache {
unsigned long unwind_completed;
unsigned int nr_entries;
@@ -30,10 +33,23 @@ union unwind_task_id {
};
struct unwind_task_info {
- unsigned long unwind_mask;
+ atomic_long_t unwind_mask;
struct unwind_cache *cache;
struct callback_head work;
union unwind_task_id id;
};
+struct unwind_work;
+struct unwind_stacktrace;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work,
+ struct unwind_stacktrace *trace,
+ u64 cookie);
+
+struct unwind_work {
+ struct list_head list;
+ unwind_callback_t func;
+ int bit;
+};
+
#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index a449f15be890..412729a269bc 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -36,8 +36,10 @@ struct unwind_user_state {
unsigned long ip;
unsigned long sp;
unsigned long fp;
+ unsigned int ws;
enum unwind_user_type current_type;
unsigned int available_types;
+ bool topmost;
bool done;
};
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 3aaf19e77558..8285b19a25e0 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -376,6 +376,9 @@ struct usb_gadget_ops {
* can handle. The UDC must support this and all slower speeds and lower
* number of lanes.
* @state: the state we are now (attached, suspended, configured, etc)
+ * @state_lock: Spinlock protecting the `state` and `teardown` members.
+ * @teardown: True if the device is undergoing teardown, used to prevent
+ * new work from being scheduled during cleanup.
* @name: Identifies the controller hardware type. Used in diagnostics
* and sometimes configuration.
* @dev: Driver model state for this abstract device.
@@ -451,6 +454,8 @@ struct usb_gadget {
enum usb_ssp_rate max_ssp_rate;
enum usb_device_state state;
+ spinlock_t state_lock;
+ bool teardown;
const char *name;
struct device dev;
unsigned isoch_delay;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 9a9aebbf96b9..9c3be157397e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX;
}
-#ifdef CONFIG_USER_NS
-
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
return container_of(ns, struct user_namespace, ns);
}
+#ifdef CONFIG_USER_NS
+
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
if (ns)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 4d1780848d0e..75dabb763c65 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -384,7 +384,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb,
struct virtio_net_hdr_v1_hash_tunnel *vhdr,
bool tnl_hdr_negotiated,
bool little_endian,
- int vlan_hlen)
+ int vlan_hlen,
+ bool has_data_valid)
{
struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr;
unsigned int inner_nh, outer_th;
@@ -394,14 +395,15 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb,
tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM);
if (!tnl_gso_type)
- return virtio_net_hdr_from_skb(skb, hdr, little_endian, false,
- vlan_hlen);
+ return virtio_net_hdr_from_skb(skb, hdr, little_endian,
+ has_data_valid, vlan_hlen);
/* Tunnel support not negotiated but skb ask for it. */
if (!tnl_hdr_negotiated)
return -EINVAL;
- vhdr->hash_hdr.hash_value = 0;
+ vhdr->hash_hdr.hash_value_lo = 0;
+ vhdr->hash_hdr.hash_value_hi = 0;
vhdr->hash_hdr.hash_report = 0;
vhdr->hash_hdr.padding = 0;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 22dd4adc5667..f48e8ccffe81 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -189,11 +189,11 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
void inode_wait_for_writeback(struct inode *inode);
void inode_io_list_del(struct inode *inode);
-/* writeback.h requires fs.h; it, too, is not included from here. */
-static inline void wait_on_inode(struct inode *inode)
+static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc)
{
- wait_var_event(inode_state_wait_address(inode, __I_NEW),
- !(READ_ONCE(inode->i_state) & I_NEW));
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ return PAGECACHE_TAG_TOWRITE;
+ return PAGECACHE_TAG_DIRTY;
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -234,7 +234,7 @@ static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
static inline void inode_detach_wb(struct inode *inode)
{
if (inode->i_wb) {
- WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR));
wb_put(inode->i_wb);
inode->i_wb = NULL;
}
@@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *);
void sb_mark_inode_writeback(struct inode *inode);
void sb_clear_inode_writeback(struct inode *inode);
+/*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
+
#endif /* WRITEBACK_H */
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 86b0d47984a1..64e9afe7d647 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -85,12 +85,12 @@ int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *,
const char *, const void *, size_t, int);
int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *,
const char *, const void *, size_t, int,
- struct inode **);
+ struct delegated_inode *);
int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *,
const void *, size_t, int);
int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);
int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *,
- const char *, struct inode **);
+ const char *, struct delegated_inode *);
int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);
ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 9ecc70baaca9..cb4c02d00759 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -434,6 +434,7 @@ enum {
HCI_USER_CHANNEL,
HCI_EXT_CONFIGURED,
HCI_LE_ADV,
+ HCI_LE_ADV_0,
HCI_LE_PER_ADV,
HCI_LE_SCAN,
HCI_SSP_ENABLED,
@@ -2782,6 +2783,11 @@ struct hci_ev_le_per_adv_report {
__u8 data[];
} __packed;
+#define HCI_EV_LE_PA_SYNC_LOST 0x10
+struct hci_ev_le_pa_sync_lost {
+ __le16 handle;
+} __packed;
+
#define LE_PA_DATA_COMPLETE 0x00
#define LE_PA_DATA_MORE_TO_COME 0x01
#define LE_PA_DATA_TRUNCATED 0x02
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 2924c2bf2a98..0cb87687837f 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -244,6 +244,7 @@ struct adv_info {
bool enabled;
bool pending;
bool periodic;
+ bool periodic_enabled;
__u8 mesh;
__u8 instance;
__u8 handle;
@@ -748,7 +749,6 @@ struct hci_conn {
__u8 remote_cap;
__u8 remote_auth;
- __u8 remote_id;
unsigned int sent;
@@ -856,11 +856,12 @@ extern struct mutex hci_cb_list_lock;
/* ----- HCI interface to upper protocols ----- */
int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr);
int l2cap_disconn_ind(struct hci_conn *hcon);
-void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb,
+ u16 flags);
#if IS_ENABLED(CONFIG_BT_BREDR)
int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
-void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb);
+int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb);
#else
static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
__u8 *flags)
@@ -868,23 +869,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
return 0;
}
-static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb)
{
+ kfree_skb(skb);
+ return -ENOENT;
}
#endif
#if IS_ENABLED(CONFIG_BT_LE)
int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
-void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb,
+ u16 flags);
#else
static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
__u8 *flags)
{
return 0;
}
-static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb,
- u16 flags)
+
+static inline int iso_recv(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb, u16 flags)
{
+ kfree_skb(skb);
+ return -ENOENT;
}
#endif
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 4bb0eaedda18..00e182a22720 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -38,8 +38,8 @@
#define L2CAP_DEFAULT_TX_WINDOW 63
#define L2CAP_DEFAULT_EXT_WINDOW 0x3FFF
#define L2CAP_DEFAULT_MAX_TX 3
-#define L2CAP_DEFAULT_RETRANS_TO 2 /* seconds */
-#define L2CAP_DEFAULT_MONITOR_TO 12 /* seconds */
+#define L2CAP_DEFAULT_RETRANS_TO 2000 /* 2 seconds */
+#define L2CAP_DEFAULT_MONITOR_TO 12000 /* 12 seconds */
#define L2CAP_DEFAULT_MAX_PDU_SIZE 1492 /* Sized for AMP packet */
#define L2CAP_DEFAULT_ACK_TO 200
#define L2CAP_DEFAULT_MAX_SDU_SIZE 0xFFFF
diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index 74edea06985b..f5be96f08b9d 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -780,7 +780,7 @@ struct mgmt_adv_pattern {
__u8 ad_type;
__u8 offset;
__u8 length;
- __u8 value[31];
+ __u8 value[HCI_MAX_AD_LENGTH];
} __packed;
#define MGMT_OP_ADD_ADV_PATTERNS_MONITOR 0x0052
@@ -853,7 +853,7 @@ struct mgmt_cp_set_mesh {
__le16 window;
__le16 period;
__u8 num_ad_types;
- __u8 ad_types[];
+ __u8 ad_types[] __counted_by(num_ad_types);
} __packed;
#define MGMT_SET_MESH_RECEIVER_SIZE 6
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 781624f5913a..820e299f06b5 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6435,6 +6435,11 @@ static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork,
* after wiphy_lock() was called. Therefore, wiphy_cancel_work() can
* use just cancel_work() instead of cancel_work_sync(), it requires
* being in a section protected by wiphy_lock().
+ *
+ * Note that these are scheduled with a timer where the accuracy
+ * becomes less the longer in the future the scheduled timer is. Use
+ * wiphy_hrtimer_work_queue() if the timer must be not be late by more
+ * than approximately 10 percent.
*/
void wiphy_delayed_work_queue(struct wiphy *wiphy,
struct wiphy_delayed_work *dwork,
@@ -6506,6 +6511,79 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy,
bool wiphy_delayed_work_pending(struct wiphy *wiphy,
struct wiphy_delayed_work *dwork);
+struct wiphy_hrtimer_work {
+ struct wiphy_work work;
+ struct wiphy *wiphy;
+ struct hrtimer timer;
+};
+
+enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t);
+
+static inline void wiphy_hrtimer_work_init(struct wiphy_hrtimer_work *hrwork,
+ wiphy_work_func_t func)
+{
+ hrtimer_setup(&hrwork->timer, wiphy_hrtimer_work_timer,
+ CLOCK_BOOTTIME, HRTIMER_MODE_REL);
+ wiphy_work_init(&hrwork->work, func);
+}
+
+/**
+ * wiphy_hrtimer_work_queue - queue hrtimer work for the wiphy
+ * @wiphy: the wiphy to queue for
+ * @hrwork: the high resolution timer worker
+ * @delay: the delay given as a ktime_t
+ *
+ * Please refer to wiphy_delayed_work_queue(). The difference is that
+ * the hrtimer work uses a high resolution timer for scheduling. This
+ * may be needed if timeouts might be scheduled further in the future
+ * and the accuracy of the normal timer is not sufficient.
+ *
+ * Expect a delay of a few milliseconds as the timer is scheduled
+ * with some slack and some more time may pass between queueing the
+ * work and its start.
+ */
+void wiphy_hrtimer_work_queue(struct wiphy *wiphy,
+ struct wiphy_hrtimer_work *hrwork,
+ ktime_t delay);
+
+/**
+ * wiphy_hrtimer_work_cancel - cancel previously queued hrtimer work
+ * @wiphy: the wiphy, for debug purposes
+ * @hrtimer: the hrtimer work to cancel
+ *
+ * Cancel the work *without* waiting for it, this assumes being
+ * called under the wiphy mutex acquired by wiphy_lock().
+ */
+void wiphy_hrtimer_work_cancel(struct wiphy *wiphy,
+ struct wiphy_hrtimer_work *hrtimer);
+
+/**
+ * wiphy_hrtimer_work_flush - flush previously queued hrtimer work
+ * @wiphy: the wiphy, for debug purposes
+ * @hrwork: the hrtimer work to flush
+ *
+ * Flush the work (i.e. run it if pending). This must be called
+ * under the wiphy mutex acquired by wiphy_lock().
+ */
+void wiphy_hrtimer_work_flush(struct wiphy *wiphy,
+ struct wiphy_hrtimer_work *hrwork);
+
+/**
+ * wiphy_hrtimer_work_pending - Find out whether a wiphy hrtimer
+ * work item is currently pending.
+ *
+ * @wiphy: the wiphy, for debug purposes
+ * @hrwork: the hrtimer work in question
+ *
+ * Return: true if timer is pending, false otherwise
+ *
+ * Please refer to the wiphy_delayed_work_pending() documentation as
+ * this is the equivalent function for hrtimer based delayed work
+ * items.
+ */
+bool wiphy_hrtimer_work_pending(struct wiphy *wiphy,
+ struct wiphy_hrtimer_work *hrwork);
+
/**
* enum ieee80211_ap_reg_power - regulatory power for an Access Point
*
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index bc3507edd589..898723ab62e8 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -513,7 +513,7 @@ struct libeth_xdp_tx_desc {
* can't fail, but can send less frames if there's no enough free descriptors
* available. The actual free space is returned by @prep from the driver.
*/
-static __always_inline u32
+static __always_inline __nocfi_generic u32
libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq,
u32 n, bool unroll, u64 priv,
u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index c64fd896b1f9..99ac747b7906 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -536,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
case TCF_LAYER_NETWORK:
return skb_network_header(skb);
case TCF_LAYER_TRANSPORT:
+ if (!skb_transport_header_was_set(skb))
+ break;
return skb_transport_header(skb);
}
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5ca230ed526a..ab20f549b8f9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -370,7 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, int *karg);
enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
-void tcp_rcvbuf_grow(struct sock *sk);
+void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..c7bcdb3afad7 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -451,25 +451,26 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq)
/* Log all TLS record header TCP sequences in [seq, seq+len] */
static inline void
-tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len)
+tls_offload_rx_resync_async_request_start(struct tls_offload_resync_async *resync_async,
+ __be32 seq, u16 len)
{
- struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
-
- atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) |
+ atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) |
((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC);
- rx_ctx->resync_async->loglen = 0;
- rx_ctx->resync_async->rcd_delta = 0;
+ resync_async->loglen = 0;
+ resync_async->rcd_delta = 0;
}
static inline void
-tls_offload_rx_resync_async_request_end(struct sock *sk, __be32 seq)
+tls_offload_rx_resync_async_request_end(struct tls_offload_resync_async *resync_async,
+ __be32 seq)
{
- struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
+ atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ);
+}
- atomic64_set(&rx_ctx->resync_async->req,
- ((u64)ntohl(seq) << 32) | RESYNC_REQ);
+static inline void
+tls_offload_rx_resync_async_request_cancel(struct tls_offload_resync_async *resync_async)
+{
+ atomic64_set(&resync_async->req, 0);
}
static inline void
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index f3014e4f54fc..0a14daaa5dd4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -536,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family)
static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
{
- if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
+ if ((x->sel.family != AF_UNSPEC) ||
+ (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
(ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
return &x->inner_mode;
else
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 6d6500148c4b..993008cdea65 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -252,8 +252,8 @@ struct scsi_device {
unsigned int queue_stopped; /* request queue is quiesced */
bool offline_already; /* Device offline message logged */
- unsigned int ua_new_media_ctr; /* Counter for New Media UNIT ATTENTIONs */
- unsigned int ua_por_ctr; /* Counter for Power On / Reset UAs */
+ atomic_t ua_new_media_ctr; /* Counter for New Media UNIT ATTENTIONs */
+ atomic_t ua_por_ctr; /* Counter for Power On / Reset UAs */
atomic_t disk_events_disable_depth; /* disable depth for disk events */
@@ -693,10 +693,8 @@ static inline int scsi_device_busy(struct scsi_device *sdev)
}
/* Macros to access the UNIT ATTENTION counters */
-#define scsi_get_ua_new_media_ctr(sdev) \
- ((const unsigned int)(sdev->ua_new_media_ctr))
-#define scsi_get_ua_por_ctr(sdev) \
- ((const unsigned int)(sdev->ua_por_ctr))
+#define scsi_get_ua_new_media_ctr(sdev) atomic_read(&sdev->ua_new_media_ctr)
+#define scsi_get_ua_por_ctr(sdev) atomic_read(&sdev->ua_por_ctr)
#define MODULE_ALIAS_SCSI_DEVICE(type) \
MODULE_ALIAS("scsi:t-" __stringify(type) "*")
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 82904291c2b8..370f8df2fdb4 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -179,7 +179,8 @@ TRACE_EVENT(pstate_sample,
{ PM_EVENT_HIBERNATE, "hibernate" }, \
{ PM_EVENT_THAW, "thaw" }, \
{ PM_EVENT_RESTORE, "restore" }, \
- { PM_EVENT_RECOVER, "recover" })
+ { PM_EVENT_RECOVER, "recover" }, \
+ { PM_EVENT_POWEROFF, "poweroff" })
DEFINE_EVENT(cpu, cpu_frequency,
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
index 823b47d1ba1e..ce85d650bf4b 100644
--- a/include/trace/events/rseq.h
+++ b/include/trace/events/rseq.h
@@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update,
),
TP_fast_assign(
- __entry->cpu_id = raw_smp_processor_id();
+ __entry->cpu_id = t->rseq.ids.cpu_id;
__entry->node_id = cpu_to_node(__entry->cpu_id);
- __entry->mm_cid = task_mm_cid(t);
+ __entry->mm_cid = t->rseq.ids.mm_cid;
),
TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id,
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 9d2c36c6a0ed..6757233bd064 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -218,6 +218,9 @@ TRACE_EVENT(tcp_rcvbuf_grow,
__field(__u32, space)
__field(__u32, ooo_space)
__field(__u32, rcvbuf)
+ __field(__u32, rcv_ssthresh)
+ __field(__u32, window_clamp)
+ __field(__u32, rcv_wnd)
__field(__u8, scaling_ratio)
__field(__u16, sport)
__field(__u16, dport)
@@ -245,6 +248,9 @@ TRACE_EVENT(tcp_rcvbuf_grow,
tp->rcv_nxt;
__entry->rcvbuf = sk->sk_rcvbuf;
+ __entry->rcv_ssthresh = tp->rcv_ssthresh;
+ __entry->window_clamp = tp->window_clamp;
+ __entry->rcv_wnd = tp->rcv_wnd;
__entry->scaling_ratio = tp->scaling_ratio;
__entry->sport = ntohs(inet->inet_sport);
__entry->dport = ntohs(inet->inet_dport);
@@ -264,11 +270,14 @@ TRACE_EVENT(tcp_rcvbuf_grow,
),
TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+ "rcv_ssthresh=%u window_clamp=%u rcv_wnd=%u "
"family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
"saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
__entry->time, __entry->rtt_us, __entry->copied,
__entry->inq, __entry->space, __entry->ooo_space,
__entry->scaling_ratio, __entry->rcvbuf,
+ __entry->rcv_ssthresh, __entry->window_clamp,
+ __entry->rcv_wnd,
show_family_name(__entry->family),
__entry->sport, __entry->dport,
__entry->saddr, __entry->daddr,
diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h
index 47db5eaf2f9a..61171b13c687 100644
--- a/include/trace/events/timer_migration.h
+++ b/include/trace/events/timer_migration.h
@@ -173,14 +173,14 @@ DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_active,
TP_ARGS(tmc)
);
-DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_online,
+DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_available,
TP_PROTO(struct tmigr_cpu *tmc),
TP_ARGS(tmc)
);
-DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_offline,
+DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_unavailable,
TP_PROTO(struct tmigr_cpu *tmc),
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index c08aff044e80..311a341e6fe4 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
/* may be called for files on pseudo FSes w/ unregistered bdi */
strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
__entry->ino = inode->i_ino;
- __entry->state = inode->i_state;
+ __entry->state = inode_state_read_once(inode);
__entry->flags = flags;
),
@@ -748,7 +748,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
strscpy_pad(__entry->name,
bdi_dev_name(inode_to_bdi(inode)), 32);
__entry->ino = inode->i_ino;
- __entry->state = inode->i_state;
+ __entry->state = inode_state_read_once(inode);
__entry->dirtied_when = inode->dirtied_when;
__entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode));
),
@@ -787,7 +787,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
strscpy_pad(__entry->name,
bdi_dev_name(inode_to_bdi(inode)), 32);
__entry->ino = inode->i_ino;
- __entry->state = inode->i_state;
+ __entry->state = inode_state_read_once(inode);
__entry->dirtied_when = inode->dirtied_when;
__entry->writeback_index = inode->i_mapping->writeback_index;
__entry->nr_to_write = nr_to_write;
@@ -839,7 +839,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->state = inode->i_state;
+ __entry->state = inode_state_read_once(inode);
__entry->mode = inode->i_mode;
__entry->dirtied_when = inode->dirtied_when;
),
diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h
index b5f7594eee7a..0a90ad92dbf3 100644
--- a/include/uapi/asm-generic/posix_types.h
+++ b/include/uapi/asm-generic/posix_types.h
@@ -86,6 +86,7 @@ typedef struct {
*/
typedef __kernel_long_t __kernel_off_t;
typedef long long __kernel_loff_t;
+typedef unsigned long long __kernel_uoff_t;
typedef __kernel_long_t __kernel_old_time_t;
#ifndef __KERNEL__
typedef __kernel_long_t __kernel_time_t;
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 04e0077fb4c9..942370b3f5d2 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -857,9 +857,11 @@ __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr)
__SYSCALL(__NR_file_getattr, sys_file_getattr)
#define __NR_file_setattr 469
__SYSCALL(__NR_file_setattr, sys_file_setattr)
+#define __NR_listns 470
+__SYSCALL(__NR_listns, sys_listns)
#undef __NR_syscalls
-#define __NR_syscalls 470
+#define __NR_syscalls 471
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index ea91aa8afde9..e527b24bd824 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -979,14 +979,20 @@ extern "C" {
* 2 = Gob Height 8, Turing+ Page Kind mapping
* 3 = Reserved for future use.
*
- * 22:22 s Sector layout. On Tegra GPUs prior to Xavier, there is a further
- * bit remapping step that occurs at an even lower level than the
- * page kind and block linear swizzles. This causes the layout of
- * surfaces mapped in those SOC's GPUs to be incompatible with the
- * equivalent mapping on other GPUs in the same system.
- *
- * 0 = Tegra K1 - Tegra Parker/TX2 Layout.
- * 1 = Desktop GPU and Tegra Xavier+ Layout
+ * 22:22 s Sector layout. There is a further bit remapping step that occurs
+ * 26:27 at an even lower level than the page kind and block linear
+ * swizzles. This causes the bit arrangement of surfaces in memory
+ * to differ subtly, and prevents direct sharing of surfaces between
+ * GPUs with different layouts.
+ *
+ * 0 = Tegra K1 - Tegra Parker/TX2 Layout
+ * 1 = Pre-GB20x, GB20x 32+ bpp, GB10, Tegra Xavier-Orin Layout
+ * 2 = GB20x(Blackwell 2)+ 8 bpp surface layout
+ * 3 = GB20x(Blackwell 2)+ 16 bpp surface layout
+ * 4 = Reserved for future use.
+ * 5 = Reserved for future use.
+ * 6 = Reserved for future use.
+ * 7 = Reserved for future use.
*
* 25:23 c Lossless Framebuffer Compression type.
*
@@ -1001,7 +1007,7 @@ extern "C" {
* 6 = Reserved for future use
* 7 = Reserved for future use
*
- * 55:25 - Reserved for future use. Must be zero.
+ * 55:28 - Reserved for future use. Must be zero.
*/
#define DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(c, s, g, k, h) \
fourcc_mod_code(NVIDIA, (0x10 | \
@@ -1009,6 +1015,7 @@ extern "C" {
(((k) & 0xff) << 12) | \
(((g) & 0x3) << 20) | \
(((s) & 0x1) << 22) | \
+ (((s) & 0x6) << 25) | \
(((c) & 0x7) << 23)))
/* To grandfather in prior block linear format modifiers to the above layout,
diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h
new file mode 100644
index 000000000000..4ec4c0eabbbb
--- /dev/null
+++ b/include/uapi/linux/energy_model.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/em.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_ENERGY_MODEL_H
+#define _UAPI_LINUX_ENERGY_MODEL_H
+
+#define EM_FAMILY_NAME "em"
+#define EM_FAMILY_VERSION 1
+
+enum {
+ EM_A_PDS_PD = 1,
+
+ __EM_A_PDS_MAX,
+ EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1)
+};
+
+enum {
+ EM_A_PD_PAD = 1,
+ EM_A_PD_PD_ID,
+ EM_A_PD_FLAGS,
+ EM_A_PD_CPUS,
+
+ __EM_A_PD_MAX,
+ EM_A_PD_MAX = (__EM_A_PD_MAX - 1)
+};
+
+enum {
+ EM_A_PD_TABLE_PD_ID = 1,
+ EM_A_PD_TABLE_PS,
+
+ __EM_A_PD_TABLE_MAX,
+ EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1)
+};
+
+enum {
+ EM_A_PS_PAD = 1,
+ EM_A_PS_PERFORMANCE,
+ EM_A_PS_FREQUENCY,
+ EM_A_PS_POWER,
+ EM_A_PS_COST,
+ EM_A_PS_FLAGS,
+
+ __EM_A_PS_MAX,
+ EM_A_PS_MAX = (__EM_A_PS_MAX - 1)
+};
+
+enum {
+ EM_CMD_GET_PDS = 1,
+ EM_CMD_GET_PD_TABLE,
+ EM_CMD_PD_CREATED,
+ EM_CMD_PD_UPDATED,
+ EM_CMD_PD_DELETED,
+
+ __EM_CMD_MAX,
+ EM_CMD_MAX = (__EM_CMD_MAX - 1)
+};
+
+#define EM_MCGRP_EVENT "event"
+
+#endif /* _UAPI_LINUX_ENERGY_MODEL_H */
diff --git a/include/uapi/linux/fb.h b/include/uapi/linux/fb.h
index cde8f173f566..22acaaec7b1c 100644
--- a/include/uapi/linux/fb.h
+++ b/include/uapi/linux/fb.h
@@ -319,7 +319,7 @@ enum {
#define FB_VBLANK_HAVE_VCOUNT 0x020 /* the vcount field is valid */
#define FB_VBLANK_HAVE_HCOUNT 0x040 /* the hcount field is valid */
#define FB_VBLANK_VSYNCING 0x080 /* currently in a vsync */
-#define FB_VBLANK_HAVE_VSYNC 0x100 /* verical syncs can be detected */
+#define FB_VBLANK_HAVE_VSYNC 0x100 /* vertical syncs can be detected */
struct fb_vblank {
__u32 flags; /* FB_VBLANK flags */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 3741ea1b73d8..5e277fd955aa 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -4,6 +4,11 @@
#include <asm/fcntl.h>
#include <linux/openat2.h>
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0)
#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1)
@@ -79,6 +84,17 @@
*/
#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+/* Set/Get delegations */
+#define F_GETDELEG (F_LINUX_SPECIFIC_BASE + 15)
+#define F_SETDELEG (F_LINUX_SPECIFIC_BASE + 16)
+
+/* Argument structure for F_GETDELEG and F_SETDELEG */
+struct delegation {
+ uint32_t d_flags; /* Must be 0 */
+ uint16_t d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */
+ uint16_t __pad; /* Must be 0 */
+};
+
/*
* Types of directory notifications that may be requested.
*/
diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 4a9fbf42aa9f..30f3c9eaafaa 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -27,7 +27,7 @@
#define INPUT_PROP_TOPBUTTONPAD 0x04 /* softbuttons at top of pad */
#define INPUT_PROP_POINTING_STICK 0x05 /* is a pointing stick */
#define INPUT_PROP_ACCELEROMETER 0x06 /* has accelerometer */
-#define INPUT_PROP_HAPTIC_TOUCHPAD 0x07 /* is a haptic touchpad */
+#define INPUT_PROP_PRESSUREPAD 0x07 /* pressure triggers clicks */
#define INPUT_PROP_MAX 0x1f
#define INPUT_PROP_CNT (INPUT_PROP_MAX + 1)
@@ -631,6 +631,18 @@
#define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */
#define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */
+/*
+ * Keycodes for hotkeys toggling the electronic privacy screen found on some
+ * laptops on/off. Note when the embedded-controller turns on/off the eprivacy
+ * screen itself then the state should be reported through drm connecter props:
+ * https://www.kernel.org/doc/html/latest/gpu/drm-kms.html#standard-connector-properties
+ * Except when implementing the drm connecter properties API is not possible
+ * because e.g. the firmware does not allow querying the presence and/or status
+ * of the eprivacy screen at boot.
+ */
+#define KEY_EPRIVACY_SCREEN_ON 0x252
+#define KEY_EPRIVACY_SCREEN_OFF 0x253
+
#define KEY_KBDINPUTASSIST_PREV 0x260
#define KEY_KBDINPUTASSIST_NEXT 0x261
#define KEY_KBDINPUTASSIST_PREVGROUP 0x262
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 263bed13473e..b7c8dad26690 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -689,9 +689,6 @@ enum io_uring_register_op {
/* query various aspects of io_uring, see linux/io_uring/query.h */
IORING_REGISTER_QUERY = 35,
- /* return zcrx buffers back into circulation */
- IORING_REGISTER_ZCRX_REFILL = 36,
-
/* this goes last */
IORING_REGISTER_LAST,
@@ -1073,15 +1070,6 @@ struct io_uring_zcrx_ifq_reg {
__u64 __resv[3];
};
-struct io_uring_zcrx_sync_refill {
- __u32 zcrx_id;
- /* the number of entries to return */
- __u32 nr_entries;
- /* pointer to an array of struct io_uring_zcrx_rqe */
- __u64 rqes;
- __u64 __resv[2];
-};
-
#ifdef __cplusplus
}
#endif
diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index 5d754322a27c..3539ccbfd064 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -36,6 +36,9 @@ struct io_uring_query_opcode {
__u64 enter_flags;
/* Bitmask of all supported IOSQE_* flags */
__u64 sqe_flags;
+ /* The number of available query opcodes */
+ __u32 nr_query_opcodes;
+ __u32 __pad;
};
#endif
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index 8197a4800604..40aa545101a3 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -52,7 +52,7 @@ struct isst_if_cpu_map {
/**
* struct isst_if_cpu_maps - structure for CPU map IOCTL
* @cmd_count: Number of CPU mapping command in cpu_map[]
- * @cpu_map[]: Holds one or more CPU map data structure
+ * @cpu_map: Holds one or more CPU map data structure
*
* This structure used with ioctl ISST_IF_GET_PHY_ID to send
* one or more CPU mapping commands. Here IOCTL return value indicates
@@ -82,8 +82,8 @@ struct isst_if_io_reg {
/**
* struct isst_if_io_regs - structure for IO register commands
- * @cmd_count: Number of io reg commands in io_reg[]
- * @io_reg[]: Holds one or more io_reg command structure
+ * @req_count: Number of io reg commands in io_reg[]
+ * @io_reg: Holds one or more io_reg command structure
*
* This structure used with ioctl ISST_IF_IO_CMD to send
* one or more read/write commands to PUNIT. Here IOCTL return value
@@ -120,7 +120,7 @@ struct isst_if_mbox_cmd {
/**
* struct isst_if_mbox_cmds - structure for mailbox commands
* @cmd_count: Number of mailbox commands in mbox_cmd[]
- * @mbox_cmd[]: Holds one or more mbox commands
+ * @mbox_cmd: Holds one or more mbox commands
*
* This structure used with ioctl ISST_IF_MBOX_COMMAND to send
* one or more mailbox commands to PUNIT. Here IOCTL return value
@@ -152,7 +152,7 @@ struct isst_if_msr_cmd {
/**
* struct isst_if_msr_cmds - structure for msr commands
* @cmd_count: Number of mailbox commands in msr_cmd[]
- * @msr_cmd[]: Holds one or more msr commands
+ * @msr_cmd: Holds one or more msr commands
*
* This structure used with ioctl ISST_IF_MSR_COMMAND to send
* one or more MSR commands. IOCTL return value indicates number of
@@ -167,8 +167,9 @@ struct isst_if_msr_cmds {
* struct isst_core_power - Structure to get/set core_power feature
* @get_set: 0: Get, 1: Set
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @enable: Feature enable status
+ * @supported: Power domain supports SST_CP interface
* @priority_type: Priority type for the feature (ordered/proportional)
*
* Structure to get/set core_power feature state using IOCTL
@@ -187,11 +188,11 @@ struct isst_core_power {
* struct isst_clos_param - Structure to get/set clos praram
* @get_set: 0: Get, 1: Set
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
- * clos: Clos ID for the parameters
- * min_freq_mhz: Minimum frequency in MHz
- * max_freq_mhz: Maximum frequency in MHz
- * prop_prio: Proportional priority from 0-15
+ * @power_domain_id: Power Domain id
+ * @clos: Clos ID for the parameters
+ * @min_freq_mhz: Minimum frequency in MHz
+ * @max_freq_mhz: Maximum frequency in MHz
+ * @prop_prio: Proportional priority from 0-15
*
* Structure to get/set per clos property using IOCTL
* ISST_IF_CLOS_PARAM.
@@ -209,7 +210,7 @@ struct isst_clos_param {
/**
* struct isst_if_clos_assoc - Structure to assign clos to a CPU
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @logical_cpu: CPU number
* @clos: Clos ID to assign to the logical CPU
*
@@ -228,6 +229,7 @@ struct isst_if_clos_assoc {
* @get_set: Request is for get or set
* @punit_cpu_map: Set to 1 if the CPU number is punit numbering not
* Linux CPU number
+ * @assoc_info: CLOS data for this CPU
*
* Structure used to get/set associate CPUs to clos using IOCTL
* ISST_IF_CLOS_ASSOC.
@@ -257,7 +259,7 @@ struct isst_tpmi_instance_count {
/**
* struct isst_perf_level_info - Structure to get information on SST-PP levels
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @logical_cpu: CPU number
* @clos: Clos ID to assign to the logical CPU
* @max_level: Maximum performance level supported by the platform
@@ -267,8 +269,8 @@ struct isst_tpmi_instance_count {
* @feature_state: SST-BF and SST-TF (enabled/disabled) status at current level
* @locked: SST-PP performance level change is locked/unlocked
* @enabled: SST-PP feature is enabled or not
- * @sst-tf_support: SST-TF support status at this level
- * @sst-bf_support: SST-BF support status at this level
+ * @sst_tf_support: SST-TF support status at this level
+ * @sst_bf_support: SST-BF support status at this level
*
* Structure to get SST-PP details using IOCTL ISST_IF_PERF_LEVELS.
*/
@@ -289,7 +291,7 @@ struct isst_perf_level_info {
/**
* struct isst_perf_level_control - Structure to set SST-PP level
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: level to set
*
* Structure used change SST-PP level using IOCTL ISST_IF_PERF_SET_LEVEL.
@@ -303,7 +305,7 @@ struct isst_perf_level_control {
/**
* struct isst_perf_feature_control - Structure to activate SST-BF/SST-TF
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @feature: bit 0 = SST-BF state, bit 1 = SST-TF state
*
* Structure used to enable SST-BF/SST-TF using IOCTL ISST_IF_PERF_SET_FEATURE.
@@ -320,7 +322,7 @@ struct isst_perf_feature_control {
/**
* struct isst_perf_level_data_info - Structure to get SST-PP level details
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: SST-PP level for which caller wants to get information
* @tdp_ratio: TDP Ratio
* @base_freq_mhz: Base frequency in MHz
@@ -341,8 +343,8 @@ struct isst_perf_feature_control {
* @pm_fabric_freq_mhz: Fabric (Uncore) minimum frequency
* @max_buckets: Maximum trl buckets
* @max_trl_levels: Maximum trl levels
- * @bucket_core_counts[TRL_MAX_BUCKETS]: Number of cores per bucket
- * @trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS]: maximum frequency
+ * @bucket_core_counts: Number of cores per bucket
+ * @trl_freq_mhz: maximum frequency
* for a bucket and trl level
*
* Structure used to get information on frequencies and TDP for a SST-PP
@@ -402,7 +404,7 @@ struct isst_perf_level_fabric_info {
/**
* struct isst_perf_level_cpu_mask - Structure to get SST-PP level CPU mask
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: SST-PP level for which caller wants to get information
* @punit_cpu_map: Set to 1 if the CPU number is punit numbering not
* Linux CPU number. If 0 CPU buffer is copied to user space
@@ -430,7 +432,7 @@ struct isst_perf_level_cpu_mask {
/**
* struct isst_base_freq_info - Structure to get SST-BF frequencies
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: SST-PP level for which caller wants to get information
* @high_base_freq_mhz: High priority CPU base frequency
* @low_base_freq_mhz: Low priority CPU base frequency
@@ -453,9 +455,11 @@ struct isst_base_freq_info {
/**
* struct isst_turbo_freq_info - Structure to get SST-TF frequencies
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: SST-PP level for which caller wants to get information
* @max_clip_freqs: Maximum number of low priority core clipping frequencies
+ * @max_buckets: Maximum trl buckets
+ * @max_trl_levels: Maximum trl levels
* @lp_clip_freq_mhz: Clip frequencies per trl level
* @bucket_core_counts: Maximum number of cores for a bucket
* @trl_freq_mhz: Frequencies per trl level for each bucket
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 7fa67c2031a5..5d3f8c9e3a62 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -197,7 +197,7 @@ struct statmount {
*/
struct mnt_id_req {
__u32 size;
- __u32 spare;
+ __u32 mnt_ns_fd;
__u64 mnt_id;
__u64 param;
__u64 mnt_ns_id;
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index e098759ec917..a25e38d1c874 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -67,4 +67,62 @@ struct nsfs_file_handle {
#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+enum init_ns_id {
+ IPC_NS_INIT_ID = 1ULL,
+ UTS_NS_INIT_ID = 2ULL,
+ USER_NS_INIT_ID = 3ULL,
+ PID_NS_INIT_ID = 4ULL,
+ CGROUP_NS_INIT_ID = 5ULL,
+ TIME_NS_INIT_ID = 6ULL,
+ NET_NS_INIT_ID = 7ULL,
+ MNT_NS_INIT_ID = 8ULL,
+#ifdef __KERNEL__
+ NS_LAST_INIT_ID = MNT_NS_INIT_ID,
+#endif
+};
+
+enum ns_type {
+ TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */
+ MNT_NS = (1ULL << 17), /* CLONE_NEWNS */
+ CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */
+ UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */
+ IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */
+ USER_NS = (1ULL << 28), /* CLONE_NEWUSER */
+ PID_NS = (1ULL << 29), /* CLONE_NEWPID */
+ NET_NS = (1ULL << 30), /* CLONE_NEWNET */
+};
+
+/**
+ * struct ns_id_req - namespace ID request structure
+ * @size: size of this structure
+ * @spare: reserved for future use
+ * @filter: filter mask
+ * @ns_id: last namespace id
+ * @user_ns_id: owning user namespace ID
+ *
+ * Structure for passing namespace ID and miscellaneous parameters to
+ * statns(2) and listns(2).
+ *
+ * For statns(2) @param represents the request mask.
+ * For listns(2) @param represents the last listed mount id (or zero).
+ */
+struct ns_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 ns_id;
+ struct /* listns */ {
+ __u32 ns_type;
+ __u32 spare2;
+ __u64 user_ns_id;
+ };
+};
+
+/*
+ * Special @user_ns_id value that can be passed to listns()
+ */
+#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */
+
+/* List of all ns_id_req versions. */
+#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */
+
#endif /* __LINUX_NSFS_H */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 78a362b80027..c44a8fb3e418 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -382,6 +382,7 @@ enum perf_event_read_format {
#define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
+#define PERF_ATTR_SIZE_VER9 144 /* add: config4 */
/*
* 'struct perf_event_attr' contains various attributes that define
@@ -463,7 +464,9 @@ struct perf_event_attr {
inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
- __reserved_1 : 26;
+ defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+ defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+ __reserved_1 : 24;
union {
__u32 wakeup_events; /* wake up every n events */
@@ -543,6 +546,7 @@ struct perf_event_attr {
__u64 sig_data;
__u64 config3; /* extension of config2 */
+ __u64 config4; /* extension of config3 */
};
/*
@@ -1239,6 +1243,22 @@ enum perf_event_type {
*/
PERF_RECORD_AUX_OUTPUT_HW_ID = 21,
+ /*
+ * This user callchain capture was deferred until shortly before
+ * returning to user space. Previous samples would have kernel
+ * callchains only and they need to be stitched with this to make full
+ * callchains.
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u64 cookie;
+ * u64 nr;
+ * u64 ips[nr];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_CALLCHAIN_DEFERRED = 22,
+
PERF_RECORD_MAX, /* non-ABI */
};
@@ -1269,6 +1289,7 @@ enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
+ PERF_CONTEXT_USER_DEFERRED = (__u64)-640,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 957db425d459..ea9a6811fc76 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -26,8 +26,12 @@
#define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */
#define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. */
#define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */
+#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */
+#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. */
#define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */
+#define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */
+#define PIDFD_INFO_SIZE_VER2 80 /* sizeof third published struct */
/*
* Values for @coredump_mask in pidfd_info.
@@ -91,8 +95,11 @@ struct pidfd_info {
__u32 fsuid;
__u32 fsgid;
__s32 exit_code;
- __u32 coredump_mask;
- __u32 __spare1;
+ struct /* coredump info */ {
+ __u32 coredump_mask;
+ __u32 coredump_signal;
+ };
+ __u64 supported_mask; /* Mask flags that this kernel supports */
};
#define PIDFS_IOCTL_MAGIC 0xFF
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index c233aae5eac9..1b76d508400c 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -114,20 +114,13 @@ struct rseq {
/*
* Restartable sequences flags field.
*
- * This field should only be updated by the thread which
- * registered this data structure. Read by the kernel.
- * Mainly used for single-stepping through rseq critical sections
- * with debuggers.
- *
- * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
- * Inhibit instruction sequence block restart on preemption
- * for this thread.
- * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
- * Inhibit instruction sequence block restart on signal
- * delivery for this thread.
- * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
- * Inhibit instruction sequence block restart on migration for
- * this thread.
+ * This field was initially intended to allow event masking for
+ * single-stepping through rseq critical sections with debuggers.
+ * The kernel does not support this anymore and the relevant bits
+ * are checked for being always false:
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
*/
__u32 flags;
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index 386ad36f1a0a..cab5cadca8ef 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -249,8 +249,9 @@ struct tee_ioctl_param {
* @cancel_id: [in] Cancellation id, a unique value to identify this request
* @session: [out] Session id
* @ret: [out] return value
- * @ret_origin [out] origin of the return value
- * @num_params [in] number of parameters following this struct
+ * @ret_origin: [out] origin of the return value
+ * @num_params: [in] number of &struct tee_ioctl_param entries in @params
+ * @params: array of ioctl parameters
*/
struct tee_ioctl_open_session_arg {
__u8 uuid[TEE_IOCTL_UUID_LEN];
@@ -276,14 +277,14 @@ struct tee_ioctl_open_session_arg {
struct tee_ioctl_buf_data)
/**
- * struct tee_ioctl_invoke_func_arg - Invokes a function in a Trusted
- * Application
+ * struct tee_ioctl_invoke_arg - Invokes a function in a Trusted Application
* @func: [in] Trusted Application function, specific to the TA
* @session: [in] Session id
* @cancel_id: [in] Cancellation id, a unique value to identify this request
* @ret: [out] return value
- * @ret_origin [out] origin of the return value
- * @num_params [in] number of parameters following this struct
+ * @ret_origin: [out] origin of the return value
+ * @num_params: [in] number of parameters following this struct
+ * @params: array of ioctl parameters
*/
struct tee_ioctl_invoke_arg {
__u32 func;
@@ -338,7 +339,8 @@ struct tee_ioctl_close_session_arg {
/**
* struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function
* @func: [in] supplicant function
- * @num_params [in/out] number of parameters following this struct
+ * @num_params: [in/out] number of &struct tee_ioctl_param entries in @params
+ * @params: array of ioctl parameters
*
* @num_params is the number of params that tee-supplicant has room to
* receive when input, @num_params is the number of actual params
@@ -363,7 +365,8 @@ struct tee_iocl_supp_recv_arg {
/**
* struct tee_iocl_supp_send_arg - Send a response to a received request
* @ret: [out] return value
- * @num_params [in] number of parameters following this struct
+ * @num_params: [in] number of &struct tee_ioctl_param entries in @params
+ * @params: array of ioctl parameters
*/
struct tee_iocl_supp_send_arg {
__u32 ret;
@@ -454,11 +457,13 @@ struct tee_ioctl_shm_register_fd_data {
*/
/**
- * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application
+ * struct tee_ioctl_object_invoke_arg - Invokes an object in a
+ * Trusted Application
* @id: [in] Object id
* @op: [in] Object operation, specific to the object
* @ret: [out] return value
* @num_params: [in] number of parameters following this struct
+ * @params: array of ioctl parameters
*/
struct tee_ioctl_object_invoke_arg {
__u64 id;
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 8bf27ab8bcb4..1db45b01532b 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -193,7 +193,8 @@ struct virtio_net_hdr_v1 {
struct virtio_net_hdr_v1_hash {
struct virtio_net_hdr_v1 hdr;
- __le32 hash_value;
+ __le16 hash_value_lo;
+ __le16 hash_value_hi;
#define VIRTIO_NET_HASH_REPORT_NONE 0
#define VIRTIO_NET_HASH_REPORT_IPv4 1
#define VIRTIO_NET_HASH_REPORT_TCPv4 2
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 9425cfd9d00e..0f95576bf1f6 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -688,6 +688,13 @@ enum ufshcd_quirks {
* single doorbell mode.
*/
UFSHCD_QUIRK_BROKEN_LSDBS_CAP = 1 << 25,
+
+ /*
+ * This quirk indicates that DME_LINKSTARTUP should not be issued a 2nd
+ * time (refer link_startup_again) after the 1st time was successful,
+ * because it causes link startup to become unreliable.
+ */
+ UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE = 1 << 26,
};
enum ufshcd_caps {