diff options
Diffstat (limited to 'tools')
425 files changed, 17823 insertions, 2898 deletions
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 9a5d85cfd1fb..139d5e87dc95 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -75,11 +75,13 @@ #define ARM_CPU_PART_CORTEX_A76 0xD0B #define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define ARM_CPU_PART_CORTEX_A77 0xD0D +#define ARM_CPU_PART_CORTEX_A76AE 0xD0E #define ARM_CPU_PART_NEOVERSE_V1 0xD40 #define ARM_CPU_PART_CORTEX_A78 0xD41 #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 +#define ARM_CPU_PART_CORTEX_X1C 0xD4C #define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_A715 0xD4D @@ -119,9 +121,11 @@ #define QCOM_CPU_PART_KRYO 0x200 #define QCOM_CPU_PART_KRYO_2XX_GOLD 0x800 #define QCOM_CPU_PART_KRYO_2XX_SILVER 0x801 +#define QCOM_CPU_PART_KRYO_3XX_GOLD 0x802 #define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803 #define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804 #define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805 +#define QCOM_CPU_PART_ORYON_X1 0x001 #define NVIDIA_CPU_PART_DENVER 0x003 #define NVIDIA_CPU_PART_CARMEL 0x004 @@ -129,6 +133,7 @@ #define FUJITSU_CPU_PART_A64FX 0x001 #define HISI_CPU_PART_TSV110 0xD01 +#define HISI_CPU_PART_HIP09 0xD02 #define HISI_CPU_PART_HIP12 0xD06 #define APPLE_CPU_PART_M1_ICESTORM 0x022 @@ -159,11 +164,13 @@ #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) +#define MIDR_CORTEX_A76AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76AE) #define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) #define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 
MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) +#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) #define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715) @@ -196,13 +203,26 @@ #define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO) #define MIDR_QCOM_KRYO_2XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_GOLD) #define MIDR_QCOM_KRYO_2XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_SILVER) +#define MIDR_QCOM_KRYO_3XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_GOLD) #define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER) #define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD) #define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER) +#define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1) + +/* + * NOTES: + * - Qualcomm Kryo 5XX Prime / Gold ID themselves as MIDR_CORTEX_A77 + * - Qualcomm Kryo 5XX Silver IDs itself as MIDR_QCOM_KRYO_4XX_SILVER + * - Qualcomm Kryo 6XX Prime IDs itself as MIDR_CORTEX_X1 + * - Qualcomm Kryo 6XX Gold IDs itself as MIDR_CORTEX_A78 + * - Qualcomm Kryo 6XX Silver IDs itself as MIDR_CORTEX_A55 + */ + #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) +#define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09) 
#define MIDR_HISI_HIP12 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP12) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) @@ -291,6 +311,14 @@ static inline u32 __attribute_const__ read_cpuid_id(void) return read_cpuid(MIDR_EL1); } +struct target_impl_cpu { + u64 midr; + u64 revidr; + u64 aidr; +}; + +bool cpu_errata_set_target_impl(u64 num, void *impl_cpus); + static inline u64 __attribute_const__ read_cpuid_mpidr(void) { return read_cpuid(MPIDR_EL1); diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 690b6ebd118f..65f2759ea27a 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -1080,9 +1080,6 @@ #define ARM64_FEATURE_FIELD_BITS 4 -/* Defined for compatibility only, do not add new users. */ -#define ARM64_FEATURE_MASK(x) (x##_MASK) - #ifdef __ASSEMBLY__ .macro mrs_s, rt, sreg diff --git a/tools/arch/loongarch/include/asm/inst.h b/tools/arch/loongarch/include/asm/inst.h index c25b5853181d..d68fad63c8b7 100644 --- a/tools/arch/loongarch/include/asm/inst.h +++ b/tools/arch/loongarch/include/asm/inst.h @@ -51,6 +51,10 @@ enum reg2i16_op { bgeu_op = 0x1b, }; +enum reg3_op { + amswapw_op = 0x70c0, +}; + struct reg0i15_format { unsigned int immediate : 15; unsigned int opcode : 17; @@ -96,6 +100,13 @@ struct reg2i16_format { unsigned int opcode : 6; }; +struct reg3_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int rk : 5; + unsigned int opcode : 17; +}; + union loongarch_instruction { unsigned int word; struct reg0i15_format reg0i15_format; @@ -105,6 +116,7 @@ union loongarch_instruction { struct reg2i12_format reg2i12_format; struct reg2i14_format reg2i14_format; struct reg2i16_format reg2i16_format; + struct reg3_format reg3_format; }; #define LOONGARCH_INSN_SIZE sizeof(union loongarch_instruction) diff --git 
a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index eaeda001784e..077c5437f521 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -1,18 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * * Copyright IBM Corp. 2007 * * Authors: Hollis Blanchard <hollisb@us.ibm.com> diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h index 0dfc09254f99..56d7367ee344 100644 --- a/tools/arch/riscv/include/asm/csr.h +++ b/tools/arch/riscv/include/asm/csr.h @@ -468,13 +468,13 @@ #define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) #define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_STR(x) x #else #define __ASM_STR(x) #x #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define csr_swap(csr, val) \ ({ \ @@ -536,6 +536,6 @@ : "memory"); \ }) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CSR_H */ diff --git a/tools/arch/riscv/include/asm/vdso/processor.h b/tools/arch/riscv/include/asm/vdso/processor.h index 662aca039848..0665b117f30f 100644 --- a/tools/arch/riscv/include/asm/vdso/processor.h +++ b/tools/arch/riscv/include/asm/vdso/processor.h @@ -2,7 +2,7 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define 
__ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm-generic/barrier.h> @@ -27,6 +27,6 @@ static inline void cpu_relax(void) barrier(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index ee176236c2be..06fc0479a23f 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -218,6 +218,7 @@ #define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */ #define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */ #define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */ +#define X86_FEATURE_COHERENCY_SFW_NO ( 8*32+ 4) /* SNP cache coherency software work around not needed */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */ @@ -456,10 +457,14 @@ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ #define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ +#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ + #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ +#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */ + #define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ @@ -487,6 +492,9 @@ #define 
X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ #define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ #define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */ +#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ +#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ +#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ /* * BUG word(s) @@ -542,5 +550,5 @@ #define X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */ #define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ #define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ - +#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index 183aa662b165..099e926595bd 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 /* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t 
vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr & INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 0e5abd896ad4..c683d609934b 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 
*/ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if (!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git 
a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 5cfb5d74dd5f..f627196eb796 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -419,6 +421,7 @@ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) +#define DEBUGCTLMSR_RTM_DEBUG BIT(15) #define MSR_PEBS_FRONTEND 0x000003f7 @@ -733,6 +736,11 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Hardware Feedback Support MSRs */ +#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500 +#define MSR_AMD_WORKLOAD_CLASS_ID 0xc0000501 +#define MSR_AMD_WORKLOAD_HRST 0xc0000502 + /* AMD Last Branch Record MSRs */ #define MSR_AMD64_LBR_SELECT 0xc000010e @@ -831,6 +839,7 @@ #define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT) #define MSR_K7_HWCR_IRPERF_EN_BIT 30 #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) +#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35 #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 #define MSR_K7_HWCR_CPB_DIS_BIT 25 
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 6f3499507c5e..0f15d683817d 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -965,7 +965,13 @@ struct kvm_tdx_cmd { struct kvm_tdx_capabilities { __u64 supported_attrs; __u64 supported_xfam; - __u64 reserved[254]; + + __u64 kernel_tdvmcallinfo_1_r11; + __u64 user_tdvmcallinfo_1_r11; + __u64 kernel_tdvmcallinfo_1_r12; + __u64 user_tdvmcallinfo_1_r12; + + __u64 reserved[250]; /* Configurable CPUID bits for userspace */ struct kvm_cpuid2 cpuid; diff --git a/tools/arch/x86/lib/inat.c b/tools/arch/x86/lib/inat.c index dfbcc6405941..ffcb0e27453b 100644 --- a/tools/arch/x86/lib/inat.c +++ b/tools/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index bce69c6bfa69..1d1c57c74d1f 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ found: } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 
11b, it is @@ -226,13 +229,13 @@ found: if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb95..2a4e69ecc2de 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. +# (xop): this opcode accepts XOP prefix. 
+# +# XOP Superscripts +# (W=0): this opcode requires XOP.W == 0 +# (W=1): this opcode requires XOP.W == 1 # # Last Prefix Superscripts # - (66): the last prefix is 0x66 @@ -194,7 +199,7 @@ AVXcode: 8c: MOV Ev,Sw 8d: LEA Gv,M 8e: MOV Sw,Ew -8f: Grp1A (1A) | POP Ev (d64) +8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix) # 0x90 - 0x9f 90: NOP | PAUSE (F3) | XCHG r8,rAX 91: XCHG rCX/r9,rAX @@ -1106,6 +1111,84 @@ AVXcode: 7 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) EndTable +# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5 +Table: XOP map 8h +Referrer: +XOPcode: 0 +85: VPMACSSWW Vo,Ho,Wo,Lo +86: VPMACSSWD Vo,Ho,Wo,Lo +87: VPMACSSDQL Vo,Ho,Wo,Lo +8e: VPMACSSDD Vo,Ho,Wo,Lo +8f: VPMACSSDQH Vo,Ho,Wo,Lo +95: VPMACSWW Vo,Ho,Wo,Lo +96: VPMACSWD Vo,Ho,Wo,Lo +97: VPMACSDQL Vo,Ho,Wo,Lo +9e: VPMACSDD Vo,Ho,Wo,Lo +9f: VPMACSDQH Vo,Ho,Wo,Lo +a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1) +a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1) +a6: VPMADCSSWD Vo,Ho,Wo,Lo +b6: VPMADCSWD Vo,Ho,Wo,Lo +c0: VPROTB Vo,Wo,Ib +c1: VPROTW Vo,Wo,Ib +c2: VPROTD Vo,Wo,Ib +c3: VPROTQ Vo,Wo,Ib +cc: VPCOMccB Vo,Ho,Wo,Ib +cd: VPCOMccW Vo,Ho,Wo,Ib +ce: VPCOMccD Vo,Ho,Wo,Ib +cf: VPCOMccQ Vo,Ho,Wo,Ib +ec: VPCOMccUB Vo,Ho,Wo,Ib +ed: VPCOMccUW Vo,Ho,Wo,Ib +ee: VPCOMccUD Vo,Ho,Wo,Ib +ef: VPCOMccUQ Vo,Ho,Wo,Ib +EndTable + +Table: XOP map 9h +Referrer: +XOPcode: 1 +01: GrpXOP1 +02: GrpXOP2 +12: GrpXOP3 +80: VFRCZPS Vx,Wx +81: VFRCZPD Vx,Wx +82: VFRCZSS Vq,Wss +83: VFRCZSD Vq,Wsd +90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +91: VPROTW Vo,Wo,Ho (W=0) | VPROTW Vo,Ho,Wo (W=1) +92: VPROTD Vo,Wo,Ho (W=0) | VPROTD Vo,Ho,Wo (W=1) +93: VPROTQ Vo,Wo,Ho (W=0) | VPROTQ Vo,Ho,Wo (W=1) +94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1) +95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1) +96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1) +97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1) +98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1) +99: VPSHAW Vo,Wo,Ho (W=0) | 
VPSHAW Vo,Ho,Wo (W=1) +9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1) +9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1) +c1: VPHADDBW Vo,Wo +c2: VPHADDBD Vo,Wo +c3: VPHADDBQ Vo,Wo +c6: VPHADDWD Vo,Wo +c7: VPHADDWQ Vo,Wo +cb: VPHADDDQ Vo,Wo +d1: VPHADDUBW Vo,Wo +d2: VPHADDUBD Vo,Wo +d3: VPHADDUBQ Vo,Wo +d6: VPHADDUWD Vo,Wo +d7: VPHADDUWQ Vo,Wo +db: VPHADDUDQ Vo,Wo +e1: VPHSUBBW Vo,Wo +e2: VPHSUBWD Vo,Wo +e3: VPHSUBDQ Vo,Wo +EndTable + +Table: XOP map Ah +Referrer: +XOPcode: 2 +10: BEXTR Gy,Ey,Id +12: GrpXOP4 +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1320,3 +1403,29 @@ GrpTable: GrpRNG 4: xcrypt-cfb 5: xcrypt-ofb EndTable + +# GrpXOP1-4 is shown in AMD APM Vol.3 Appendix A as XOP group #1-4 +GrpTable: GrpXOP1 +1: BLCFILL By,Ey (xop) +2: BLSFILL By,Ey (xop) +3: BLCS By,Ey (xop) +4: TZMSK By,Ey (xop) +5: BLCIC By,Ey (xop) +6: BLSIC By,Ey (xop) +7: T1MSKC By,Ey (xop) +EndTable + +GrpTable: GrpXOP2 +1: BLCMSK By,Ey (xop) +6: BLCI By,Ey (xop) +EndTable + +GrpTable: GrpXOP3 +0: LLWPCB Ry (xop) +1: SLWPCB Ry (xop) +EndTable + +GrpTable: GrpXOP4 +0: LWPINS By,Ed,Id (xop) +1: LWPVAL By,Ed,Id (xop) +EndTable diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index 2c19d7fc8a85..7ea1b75e59b7 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -21,6 +21,7 @@ function clear_vars() { eid = -1 # escape id gid = -1 # group id aid = -1 # AVX id + xopid = -1 # XOP id tname = "" } @@ -39,9 +40,11 @@ BEGIN { ggid = 1 geid = 1 gaid = 0 + gxopid = 0 delete etable delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is 
the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + 
print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 57c669d2aa90..55d59ed507d5 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -193,7 +193,7 @@ static int load_xbc_from_initrd(int fd, char **buf) if (stat.st_size < BOOTCONFIG_FOOTER_SIZE) return 0; - if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) + if (lseek(fd, -(off_t)BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) return pr_errno("Failed to lseek for magic", -errno); if (read(fd, magic, BOOTCONFIG_MAGIC_LEN) < 0) @@ -203,7 +203,7 @@ static int load_xbc_from_initrd(int fd, char **buf) if (memcmp(magic, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN) != 0) return 0; - if (lseek(fd, -BOOTCONFIG_FOOTER_SIZE, SEEK_END) < 0) + if (lseek(fd, -(off_t)BOOTCONFIG_FOOTER_SIZE, SEEK_END) < 0) return pr_errno("Failed to lseek for size", -errno); if (read(fd, &size, sizeof(uint32_t)) < 0) diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index ca860fd97d8d..d0a36f442db7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -16,7 +16,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **gen** *COMMAND* -*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } | [ { **-S** | 
**--sign** } {**-k** <private_key.pem>} **-i** <certificate.x509> ] } *COMMAND* := { **object** | **skeleton** | **help** } @@ -186,6 +186,17 @@ OPTIONS skeleton). A light skeleton contains a loader eBPF program. It does not use the majority of the libbpf infrastructure, and does not need libelf. +-S, --sign + For skeletons, generate a signed skeleton. This option must be used with + **-k** and **-i**. Using this flag implicitly enables **--use-loader**. + +-k <private_key.pem> + Path to the private key file in PEM format, required for signing. + +-i <certificate.x509> + Path to the X.509 certificate file in PEM or DER format, required for + signing. + EXAMPLES ======== **$ cat example1.bpf.c** diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index f69fd92df8d8..009633294b09 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -18,7 +18,7 @@ SYNOPSIS *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | -{ **-L** | **--use-loader** } } +{ **-L** | **--use-loader** } | [ { **-S** | **--sign** } **-k** <private_key.pem> **-i** <certificate.x509> ] } *COMMANDS* := { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | @@ -248,6 +248,18 @@ OPTIONS creating the maps, and loading the programs (see **bpftool prog tracelog** as a way to dump those messages). +-S, --sign + Enable signing of the BPF program before loading. This option must be + used with **-k** and **-i**. Using this flag implicitly enables + **--use-loader**. + +-k <private_key.pem> + Path to the private key file in PEM format, required when signing. + +-i <certificate.x509> + Path to the X.509 certificate file in PEM or DER format, required when + signing. 
+ EXAMPLES ======== **# bpftool prog show** diff --git a/tools/bpf/bpftool/Documentation/bpftool-token.rst b/tools/bpf/bpftool/Documentation/bpftool-token.rst new file mode 100644 index 000000000000..d082c499cfe3 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-token.rst @@ -0,0 +1,64 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-token +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF tokens +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + +**bpftool** [*OPTIONS*] **token** *COMMAND* + +*OPTIONS* := { |COMMON_OPTIONS| } + +*COMMANDS* := { **show** | **list** | **help** } + +TOKEN COMMANDS +=============== + +| **bpftool** **token** { **show** | **list** } +| **bpftool** **token help** +| + +DESCRIPTION +=========== +bpftool token { show | list } + List BPF token information for each *bpffs* mount point containing token + information on the system. Information includes mount point path, allowed + **bpf**\ () system call commands, maps, programs, and attach types for the + token. + +bpftool token help + Print short help message. + +OPTIONS +======== +.. 
include:: common_options.rst + +EXAMPLES +======== +| +| **# mkdir -p /sys/fs/bpf/token** +| **# mount -t bpf bpffs /sys/fs/bpf/token** \ +| **-o delegate_cmds=prog_load:map_create** \ +| **-o delegate_progs=kprobe** \ +| **-o delegate_attachs=xdp** +| **# bpftool token list** + +:: + + token_info /sys/fs/bpf/token + allowed_cmds: + map_create prog_load + allowed_maps: + allowed_progs: + kprobe + allowed_attachs: + xdp diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 9e9a5f006cd2..586d1b2595d1 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -130,8 +130,8 @@ include $(FEATURES_DUMP) endif endif -LIBS = $(LIBBPF) -lelf -lz -LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz +LIBS = $(LIBBPF) -lelf -lz -lcrypto +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto ifeq ($(feature-libelf-zstd),1) LIBS += -lzstd @@ -194,7 +194,7 @@ endif BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o sign.o) $(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index a759ba24471d..53bcfeb1a76e 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -262,7 +262,7 @@ _bpftool() # Deal with options if [[ ${words[cword]} == -* ]]; then local c='--version --json --pretty --bpffs --mapcompat --debug \ - --use-loader --base-btf' + --use-loader --base-btf --sign -i -k' COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) return 0 fi @@ -283,7 +283,7 @@ _bpftool() _sysfs_get_netdevs return 0 ;; - file|pinned|-B|--base-btf) + file|pinned|-B|--base-btf|-i|-k) _filedir return 0 ;; @@ -296,13 +296,21 @@ _bpftool() # Remove all options so completions don't have to deal with them. 
local i pprev for (( i=1; i < ${#words[@]}; )); do - if [[ ${words[i]::1} == - ]] && - [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then - words=( "${words[@]:0:i}" "${words[@]:i+1}" ) - [[ $i -le $cword ]] && cword=$(( cword - 1 )) - else - i=$(( ++i )) - fi + case ${words[i]} in + # Remove option and its argument + -B|--base-btf|-i|-k) + words=( "${words[@]:0:i}" "${words[@]:i+2}" ) + [[ $i -le $(($cword + 1)) ]] && cword=$(( cword - 2 )) + ;; + # No argument, remove option only + -*) + words=( "${words[@]:0:i}" "${words[@]:i+1}" ) + [[ $i -le $cword ]] && cword=$(( cword - 1 )) + ;; + *) + i=$(( ++i )) + ;; + esac done cur=${words[cword]} prev=${words[cword - 1]} @@ -1215,6 +1223,17 @@ _bpftool() ;; esac ;; + token) + case $command in + show|list) + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 4e896d8a2416..ff12628593ae 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -38,7 +38,7 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d, __u32 info_len = sizeof(info); const char *prog_name = NULL; struct btf *prog_btf = NULL; - struct bpf_func_info finfo; + struct bpf_func_info finfo = {}; __u32 finfo_rec_size; char prog_str[1024]; int err; diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 944ebe21a216..ec356deb27c9 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -2,6 +2,10 @@ // Copyright (C) 2017 Facebook // Author: Roman Gushchin <guro@fb.com> +#undef GCC_VERSION +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #define _XOPEN_SOURCE 500 #include <errno.h> #include <fcntl.h> diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index b07317d2842f..e8daf963ecef 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c 
@@ -21,6 +21,7 @@ #include <sys/resource.h> #include <sys/stat.h> #include <sys/vfs.h> +#include <sys/utsname.h> #include <linux/filter.h> #include <linux/limits.h> @@ -31,6 +32,7 @@ #include <bpf/hashmap.h> #include <bpf/libbpf.h> /* libbpf_num_possible_cpus */ #include <bpf/btf.h> +#include <zlib.h> #include "main.h" @@ -1208,3 +1210,94 @@ int pathname_concat(char *buf, int buf_sz, const char *path, return 0; } + +static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, + char **value) +{ + char *sep; + + while (gzgets(file, buf, n)) { + if (strncmp(buf, "CONFIG_", 7)) + continue; + + sep = strchr(buf, '='); + if (!sep) + continue; + + /* Trim ending '\n' */ + buf[strlen(buf) - 1] = '\0'; + + /* Split on '=' and ensure that a value is present. */ + *sep = '\0'; + if (!sep[1]) + continue; + + *value = sep + 1; + return true; + } + + return false; +} + +int read_kernel_config(const struct kernel_config_option *requested_options, + size_t num_options, char **out_values, + const char *define_prefix) +{ + struct utsname utsn; + char path[PATH_MAX]; + gzFile file = NULL; + char buf[4096]; + char *value; + size_t i; + int ret = 0; + + if (!requested_options || !out_values || num_options == 0) + return -1; + + if (!uname(&utsn)) { + snprintf(path, sizeof(path), "/boot/config-%s", utsn.release); + + /* gzopen also accepts uncompressed files. */ + file = gzopen(path, "r"); + } + + if (!file) { + /* Some distributions build with CONFIG_IKCONFIG=y and put the + * config file at /proc/config.gz. 
+ */ + file = gzopen("/proc/config.gz", "r"); + } + + if (!file) { + p_info("skipping kernel config, can't open file: %s", + strerror(errno)); + return -1; + } + + if (!gzgets(file, buf, sizeof(buf)) || !gzgets(file, buf, sizeof(buf))) { + p_info("skipping kernel config, can't read from file: %s", + strerror(errno)); + ret = -1; + goto end_parse; + } + + if (strcmp(buf, "# Automatically generated file; DO NOT EDIT.\n")) { + p_info("skipping kernel config, can't find correct file"); + ret = -1; + goto end_parse; + } + + while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { + for (i = 0; i < num_options; i++) { + if ((define_prefix && !requested_options[i].macro_dump) || + out_values[i] || strcmp(buf, requested_options[i].name)) + continue; + + out_values[i] = strdup(value); + } + } + +end_parse: + gzclose(file); + return ret; +} diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 24fecdf8e430..0f6070a0c8e7 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -10,7 +10,6 @@ #ifdef USE_LIBCAP #include <sys/capability.h> #endif -#include <sys/utsname.h> #include <sys/vfs.h> #include <linux/filter.h> @@ -18,7 +17,6 @@ #include <bpf/bpf.h> #include <bpf/libbpf.h> -#include <zlib.h> #include "main.h" @@ -327,40 +325,9 @@ static void probe_jit_limit(void) } } -static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, - char **value) -{ - char *sep; - - while (gzgets(file, buf, n)) { - if (strncmp(buf, "CONFIG_", 7)) - continue; - - sep = strchr(buf, '='); - if (!sep) - continue; - - /* Trim ending '\n' */ - buf[strlen(buf) - 1] = '\0'; - - /* Split on '=' and ensure that a value is present. 
*/ - *sep = '\0'; - if (!sep[1]) - continue; - - *value = sep + 1; - return true; - } - - return false; -} - static void probe_kernel_image_config(const char *define_prefix) { - static const struct { - const char * const name; - bool macro_dump; - } options[] = { + struct kernel_config_option options[] = { /* Enable BPF */ { "CONFIG_BPF", }, /* Enable bpf() syscall */ @@ -435,52 +402,11 @@ static void probe_kernel_image_config(const char *define_prefix) { "CONFIG_HZ", true, } }; char *values[ARRAY_SIZE(options)] = { }; - struct utsname utsn; - char path[PATH_MAX]; - gzFile file = NULL; - char buf[4096]; - char *value; size_t i; - if (!uname(&utsn)) { - snprintf(path, sizeof(path), "/boot/config-%s", utsn.release); - - /* gzopen also accepts uncompressed files. */ - file = gzopen(path, "r"); - } - - if (!file) { - /* Some distributions build with CONFIG_IKCONFIG=y and put the - * config file at /proc/config.gz. - */ - file = gzopen("/proc/config.gz", "r"); - } - if (!file) { - p_info("skipping kernel config, can't open file: %s", - strerror(errno)); - goto end_parse; - } - /* Sanity checks */ - if (!gzgets(file, buf, sizeof(buf)) || - !gzgets(file, buf, sizeof(buf))) { - p_info("skipping kernel config, can't read from file: %s", - strerror(errno)); - goto end_parse; - } - if (strcmp(buf, "# Automatically generated file; DO NOT EDIT.\n")) { - p_info("skipping kernel config, can't find correct file"); - goto end_parse; - } - - while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { - for (i = 0; i < ARRAY_SIZE(options); i++) { - if ((define_prefix && !options[i].macro_dump) || - values[i] || strcmp(buf, options[i].name)) - continue; - - values[i] = strdup(value); - } - } + if (read_kernel_config(options, ARRAY_SIZE(options), values, + define_prefix)) + return; for (i = 0; i < ARRAY_SIZE(options); i++) { if (define_prefix && !options[i].macro_dump) @@ -488,10 +414,6 @@ static void probe_kernel_image_config(const char *define_prefix) 
print_kernel_option(options[i].name, values[i], define_prefix); free(values[i]); } - -end_parse: - if (file) - gzclose(file); } static bool probe_bpf_syscall(const char *define_prefix) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 67a60114368f..993c7d9484a4 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -688,10 +688,17 @@ static void codegen_destroy(struct bpf_object *obj, const char *obj_name) static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *header_guard) { DECLARE_LIBBPF_OPTS(gen_loader_opts, opts); + struct bpf_load_and_run_opts sopts = {}; + char sig_buf[MAX_SIG_SIZE]; + __u8 prog_sha[SHA256_DIGEST_LENGTH]; struct bpf_map *map; + char ident[256]; int err = 0; + if (sign_progs) + opts.gen_hash = true; + err = bpf_object__gen_loader(obj, &opts); if (err) return err; @@ -701,6 +708,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h p_err("failed to load object file"); goto out; } + /* If there was no error during load then gen_loader_opts * are populated with the loader program. 
*/ @@ -780,8 +788,52 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h print_hex(opts.insns, opts.insns_sz); codegen("\ \n\ - \"; \n\ - \n\ + \";\n"); + + if (sign_progs) { + sopts.insns = opts.insns; + sopts.insns_sz = opts.insns_sz; + sopts.excl_prog_hash = prog_sha; + sopts.excl_prog_hash_sz = sizeof(prog_sha); + sopts.signature = sig_buf; + sopts.signature_sz = MAX_SIG_SIZE; + + err = bpftool_prog_sign(&sopts); + if (err < 0) { + p_err("failed to sign program"); + goto out; + } + + codegen("\ + \n\ + static const char opts_sig[] __attribute__((__aligned__(8))) = \"\\\n\ + "); + print_hex((const void *)sig_buf, sopts.signature_sz); + codegen("\ + \n\ + \";\n"); + + codegen("\ + \n\ + static const char opts_excl_hash[] __attribute__((__aligned__(8))) = \"\\\n\ + "); + print_hex((const void *)prog_sha, sizeof(prog_sha)); + codegen("\ + \n\ + \";\n"); + + codegen("\ + \n\ + opts.signature = (void *)opts_sig; \n\ + opts.signature_sz = sizeof(opts_sig) - 1; \n\ + opts.excl_prog_hash = (void *)opts_excl_hash; \n\ + opts.excl_prog_hash_sz = sizeof(opts_excl_hash) - 1; \n\ + opts.keyring_id = skel->keyring_id; \n\ + "); + } + + codegen("\ + \n\ opts.ctx = (struct bpf_loader_ctx *)skel; \n\ opts.data_sz = sizeof(opts_data) - 1; \n\ opts.data = (void *)opts_data; \n\ @@ -1240,7 +1292,7 @@ static int do_skeleton(int argc, char **argv) err = -errno; libbpf_strerror(err, err_buf, sizeof(err_buf)); p_err("failed to open BPF object file: %s", err_buf); - goto out; + goto out_obj; } bpf_object__for_each_map(map, obj) { @@ -1355,6 +1407,13 @@ static int do_skeleton(int argc, char **argv) printf("\t} links;\n"); } + if (sign_progs) { + codegen("\ + \n\ + __s32 keyring_id; \n\ + "); + } + if (btf) { err = codegen_datasecs(obj, obj_name); if (err) @@ -1552,6 +1611,7 @@ static int do_skeleton(int argc, char **argv) err = 0; out: bpf_object__close(obj); +out_obj: if (obj_data) munmap(obj_data, mmap_sz); close(fd); @@ -1930,7 +1990,7 @@ static int 
do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_OPTIONS " |\n" - " {-L|--use-loader} }\n" + " {-L|--use-loader} | [ {-S|--sign } {-k} <private_key.pem> {-i} <certificate.x509> ]}\n" "", bin_name, "gen"); diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index a773e05d5ade..bdcd717b0348 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -282,11 +282,52 @@ get_addr_cookie_array(__u64 *addrs, __u64 *cookies, __u32 count) return data; } +static bool is_x86_ibt_enabled(void) +{ +#if defined(__x86_64__) + struct kernel_config_option options[] = { + { "CONFIG_X86_KERNEL_IBT", }, + }; + char *values[ARRAY_SIZE(options)] = { }; + bool ret; + + if (read_kernel_config(options, ARRAY_SIZE(options), values, NULL)) + return false; + + ret = !!values[0]; + free(values[0]); + return ret; +#else + return false; +#endif +} + +static bool +symbol_matches_target(__u64 sym_addr, __u64 target_addr, bool is_ibt_enabled) +{ + if (sym_addr == target_addr) + return true; + + /* + * On x86_64 architectures with CET (Control-flow Enforcement Technology), + * function entry points have a 4-byte 'endbr' instruction prefix. + * This causes kprobe hooks to target the address *after* 'endbr' + * (symbol address + 4), preserving the CET instruction. + * Here we check if the symbol address matches the hook target address + * minus 4, indicating a CET-enabled function entry point. 
+ */ + if (is_ibt_enabled && sym_addr == target_addr - 4) + return true; + + return false; +} + static void show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) { struct addr_cookie *data; __u32 i, j = 0; + bool is_ibt_enabled; jsonw_bool_field(json_wtr, "retprobe", info->kprobe_multi.flags & BPF_F_KPROBE_MULTI_RETURN); @@ -306,11 +347,13 @@ show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) if (!dd.sym_count) goto error; + is_ibt_enabled = is_x86_ibt_enabled(); for (i = 0; i < dd.sym_count; i++) { - if (dd.sym_mapping[i].address != data[j].addr) + if (!symbol_matches_target(dd.sym_mapping[i].address, + data[j].addr, is_ibt_enabled)) continue; jsonw_start_object(json_wtr); - jsonw_uint_field(json_wtr, "addr", dd.sym_mapping[i].address); + jsonw_uint_field(json_wtr, "addr", (unsigned long)data[j].addr); jsonw_string_field(json_wtr, "func", dd.sym_mapping[i].name); /* Print null if it is vmlinux */ if (dd.sym_mapping[i].module[0] == '\0') { @@ -719,6 +762,7 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info) { struct addr_cookie *data; __u32 i, j = 0; + bool is_ibt_enabled; if (!info->kprobe_multi.count) return; @@ -742,12 +786,14 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info) if (!dd.sym_count) goto error; + is_ibt_enabled = is_x86_ibt_enabled(); printf("\n\t%-16s %-16s %s", "addr", "cookie", "func [module]"); for (i = 0; i < dd.sym_count; i++) { - if (dd.sym_mapping[i].address != data[j].addr) + if (!symbol_matches_target(dd.sym_mapping[i].address, + data[j].addr, is_ibt_enabled)) continue; printf("\n\t%016lx %-16llx %s", - dd.sym_mapping[i].address, data[j].cookie, dd.sym_mapping[i].name); + (unsigned long)data[j].addr, data[j].cookie, dd.sym_mapping[i].name); if (dd.sym_mapping[i].module[0] != '\0') printf(" [%s] ", dd.sym_mapping[i].module); else diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 2b7f2bd3a7db..a829a6a49037 100644 --- a/tools/bpf/bpftool/main.c +++ 
b/tools/bpf/bpftool/main.c @@ -33,6 +33,9 @@ bool relaxed_maps; bool use_loader; struct btf *base_btf; struct hashmap *refs_table; +bool sign_progs; +const char *private_key_path; +const char *cert_path; static void __noreturn clean_and_exit(int i) { @@ -61,7 +64,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n" + " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter | token }\n" " " HELP_SPEC_OPTIONS " |\n" " {-V|--version} }\n" "", @@ -87,6 +90,7 @@ static const struct cmd commands[] = { { "gen", do_gen }, { "struct_ops", do_struct_ops }, { "iter", do_iter }, + { "token", do_token }, { "version", do_version }, { 0 } }; @@ -447,6 +451,7 @@ int main(int argc, char **argv) { "nomount", no_argument, NULL, 'n' }, { "debug", no_argument, NULL, 'd' }, { "use-loader", no_argument, NULL, 'L' }, + { "sign", no_argument, NULL, 'S' }, { "base-btf", required_argument, NULL, 'B' }, { 0 } }; @@ -473,7 +478,7 @@ int main(int argc, char **argv) bin_name = "bpftool"; opterr = 0; - while ((opt = getopt_long(argc, argv, "VhpjfLmndB:l", + while ((opt = getopt_long(argc, argv, "VhpjfLmndSi:k:B:l", options, NULL)) >= 0) { switch (opt) { case 'V': @@ -519,6 +524,16 @@ int main(int argc, char **argv) case 'L': use_loader = true; break; + case 'S': + sign_progs = true; + use_loader = true; + break; + case 'k': + private_key_path = optarg; + break; + case 'i': + cert_path = optarg; + break; default: p_err("unrecognized option '%s'", argv[optind - 1]); if (json_output) @@ -533,6 +548,16 @@ int main(int argc, char **argv) if (argc < 0) usage(); + if (sign_progs && (private_key_path == NULL || cert_path == NULL)) { + p_err("-i <identity_x509_cert> and -k <private_key> must be supplied with -S for signing"); + return -EINVAL; + } + + if (!sign_progs && (private_key_path != NULL || cert_path != NULL)) { + 
p_err("--sign (or -S) must be explicitly passed with -i <identity_x509_cert> and -k <private_key> to sign the programs"); + return -EINVAL; + } + if (version_requested) ret = do_version(argc, argv); else diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 6db704fda5c0..1130299cede0 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -6,9 +6,14 @@ /* BFD and kernel.h both define GCC_VERSION, differently */ #undef GCC_VERSION +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #include <stdbool.h> #include <stdio.h> +#include <errno.h> #include <stdlib.h> +#include <bpf/skel_internal.h> #include <linux/bpf.h> #include <linux/compiler.h> #include <linux/kernel.h> @@ -52,6 +57,7 @@ static inline void *u64_to_ptr(__u64 ptr) }) #define ERR_MAX_LEN 1024 +#define MAX_SIG_SIZE 4096 #define BPF_TAG_FMT "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" @@ -85,6 +91,9 @@ extern bool relaxed_maps; extern bool use_loader; extern struct btf *base_btf; extern struct hashmap *refs_table; +extern bool sign_progs; +extern const char *private_key_path; +extern const char *cert_path; void __printf(1, 2) p_err(const char *fmt, ...); void __printf(1, 2) p_info(const char *fmt, ...); @@ -166,6 +175,7 @@ int do_tracelog(int argc, char **arg) __weak; int do_feature(int argc, char **argv) __weak; int do_struct_ops(int argc, char **argv) __weak; int do_iter(int argc, char **argv) __weak; +int do_token(int argc, char **argv) __weak; int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); @@ -274,4 +284,15 @@ int pathname_concat(char *buf, int buf_sz, const char *path, /* print netfilter bpf_link info */ void netfilter_dump_plain(const struct bpf_link_info *info); void netfilter_dump_json(const struct bpf_link_info *info, json_writer_t *wtr); + +struct kernel_config_option { + const char *name; + bool macro_dump; +}; + +int read_kernel_config(const struct kernel_config_option *requested_options, 
+ size_t num_options, char **out_values, + const char *define_prefix); +int bpftool_prog_sign(struct bpf_load_and_run_opts *opts); +__u32 register_session_key(const char *key_der_path); #endif diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9722d841abc0..6daf19809ca4 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -23,6 +23,7 @@ #include <linux/err.h> #include <linux/perf_event.h> #include <linux/sizes.h> +#include <linux/keyctl.h> #include <bpf/bpf.h> #include <bpf/btf.h> @@ -714,7 +715,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, if (mode == DUMP_JITED) { if (info->jited_prog_len == 0 || !info->jited_prog_insns) { - p_info("no instructions returned"); + p_err("error retrieving jit dump: no instructions returned or kernel.kptr_restrict set?"); return -1; } buf = u64_to_ptr(info->jited_prog_insns); @@ -1930,6 +1931,8 @@ static int try_loader(struct gen_loader_opts *gen) { struct bpf_load_and_run_opts opts = {}; struct bpf_loader_ctx *ctx; + char sig_buf[MAX_SIG_SIZE]; + __u8 prog_sha[SHA256_DIGEST_LENGTH]; int ctx_sz = sizeof(*ctx) + 64 * max(sizeof(struct bpf_map_desc), sizeof(struct bpf_prog_desc)); int log_buf_sz = (1u << 24) - 1; @@ -1953,6 +1956,26 @@ static int try_loader(struct gen_loader_opts *gen) opts.insns = gen->insns; opts.insns_sz = gen->insns_sz; fds_before = count_open_fds(); + + if (sign_progs) { + opts.excl_prog_hash = prog_sha; + opts.excl_prog_hash_sz = sizeof(prog_sha); + opts.signature = sig_buf; + opts.signature_sz = MAX_SIG_SIZE; + opts.keyring_id = KEY_SPEC_SESSION_KEYRING; + + err = bpftool_prog_sign(&opts); + if (err < 0) { + p_err("failed to sign program"); + goto out; + } + + err = register_session_key(cert_path); + if (err < 0) { + p_err("failed to add session key"); + goto out; + } + } err = bpf_load_and_run(&opts); fd_delta = count_open_fds() - fds_before; if (err < 0 || verifier_logs) { @@ -1961,6 +1984,7 @@ static int try_loader(struct gen_loader_opts *gen) 
fprintf(stderr, "loader prog leaked %d FDs\n", fd_delta); } +out: free(log_buf); return err; } @@ -1988,6 +2012,9 @@ static int do_loader(int argc, char **argv) goto err_close_obj; } + if (sign_progs) + gen.gen_hash = true; + err = bpf_object__gen_loader(obj, &gen); if (err) goto err_close_obj; @@ -2262,7 +2289,7 @@ static void profile_print_readings(void) static char *profile_target_name(int tgt_fd) { - struct bpf_func_info func_info; + struct bpf_func_info func_info = {}; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); const struct btf_type *t; @@ -2562,7 +2589,7 @@ static int do_help(int argc, char **argv) " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} |\n" - " {-L|--use-loader} }\n" + " {-L|--use-loader} | [ {-S|--sign } {-k} <private_key.pem> {-i} <certificate.x509> ] \n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/sign.c b/tools/bpf/bpftool/sign.c new file mode 100644 index 000000000000..b34f74d210e9 --- /dev/null +++ b/tools/bpf/bpftool/sign.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* + * Copyright (C) 2025 Google LLC. 
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdbool.h> +#include <string.h> +#include <getopt.h> +#include <err.h> +#include <openssl/opensslv.h> +#include <openssl/bio.h> +#include <openssl/evp.h> +#include <openssl/pem.h> +#include <openssl/err.h> +#include <openssl/cms.h> +#include <linux/keyctl.h> +#include <errno.h> + +#include <bpf/skel_internal.h> + +#include "main.h" + +#define OPEN_SSL_ERR_BUF_LEN 256 + +static void display_openssl_errors(int l) +{ + char buf[OPEN_SSL_ERR_BUF_LEN]; + const char *file; + const char *data; + unsigned long e; + int flags; + int line; + + while ((e = ERR_get_error_all(&file, &line, NULL, &data, &flags))) { + ERR_error_string_n(e, buf, sizeof(buf)); + if (data && (flags & ERR_TXT_STRING)) { + p_err("OpenSSL %s: %s:%d: %s", buf, file, line, data); + } else { + p_err("OpenSSL %s: %s:%d", buf, file, line); + } + } +} + +#define DISPLAY_OSSL_ERR(cond) \ + do { \ + bool __cond = (cond); \ + if (__cond && ERR_peek_error()) \ + display_openssl_errors(__LINE__);\ + } while (0) + +static EVP_PKEY *read_private_key(const char *pkey_path) +{ + EVP_PKEY *private_key = NULL; + BIO *b; + + b = BIO_new_file(pkey_path, "rb"); + private_key = PEM_read_bio_PrivateKey(b, NULL, NULL, NULL); + BIO_free(b); + DISPLAY_OSSL_ERR(!private_key); + return private_key; +} + +static X509 *read_x509(const char *x509_name) +{ + unsigned char buf[2]; + X509 *x509 = NULL; + BIO *b; + int n; + + b = BIO_new_file(x509_name, "rb"); + if (!b) + goto cleanup; + + /* Look at the first two bytes of the file to determine the encoding */ + n = BIO_read(b, buf, 2); + if (n != 2) + goto cleanup; + + if (BIO_reset(b) != 0) + goto cleanup; + + if (buf[0] == 0x30 && buf[1] >= 0x81 && buf[1] <= 0x84) + /* Assume raw DER encoded X.509 */ + x509 = d2i_X509_bio(b, NULL); + else + /* Assume PEM encoded X.509 */ + x509 = PEM_read_bio_X509(b, NULL, NULL, NULL); + +cleanup: + BIO_free(b); + 
DISPLAY_OSSL_ERR(!x509); + return x509; +} + +__u32 register_session_key(const char *key_der_path) +{ + unsigned char *der_buf = NULL; + X509 *x509 = NULL; + int key_id = -1; + int der_len; + + if (!key_der_path) + return key_id; + x509 = read_x509(key_der_path); + if (!x509) + goto cleanup; + der_len = i2d_X509(x509, &der_buf); + if (der_len < 0) + goto cleanup; + key_id = syscall(__NR_add_key, "asymmetric", key_der_path, der_buf, + (size_t)der_len, KEY_SPEC_SESSION_KEYRING); +cleanup: + X509_free(x509); + OPENSSL_free(der_buf); + DISPLAY_OSSL_ERR(key_id == -1); + return key_id; +} + +int bpftool_prog_sign(struct bpf_load_and_run_opts *opts) +{ + BIO *bd_in = NULL, *bd_out = NULL; + EVP_PKEY *private_key = NULL; + CMS_ContentInfo *cms = NULL; + long actual_sig_len = 0; + X509 *x509 = NULL; + int err = 0; + + bd_in = BIO_new_mem_buf(opts->insns, opts->insns_sz); + if (!bd_in) { + err = -ENOMEM; + goto cleanup; + } + + private_key = read_private_key(private_key_path); + if (!private_key) { + err = -EINVAL; + goto cleanup; + } + + x509 = read_x509(cert_path); + if (!x509) { + err = -EINVAL; + goto cleanup; + } + + cms = CMS_sign(NULL, NULL, NULL, NULL, + CMS_NOCERTS | CMS_PARTIAL | CMS_BINARY | CMS_DETACHED | + CMS_STREAM); + if (!cms) { + err = -EINVAL; + goto cleanup; + } + + if (!CMS_add1_signer(cms, x509, private_key, EVP_sha256(), + CMS_NOCERTS | CMS_BINARY | CMS_NOSMIMECAP | + CMS_USE_KEYID | CMS_NOATTR)) { + err = -EINVAL; + goto cleanup; + } + + if (CMS_final(cms, bd_in, NULL, CMS_NOCERTS | CMS_BINARY) != 1) { + err = -EIO; + goto cleanup; + } + + EVP_Digest(opts->insns, opts->insns_sz, opts->excl_prog_hash, + &opts->excl_prog_hash_sz, EVP_sha256(), NULL); + + bd_out = BIO_new(BIO_s_mem()); + if (!bd_out) { + err = -ENOMEM; + goto cleanup; + } + + if (!i2d_CMS_bio_stream(bd_out, cms, NULL, 0)) { + err = -EIO; + goto cleanup; + } + + actual_sig_len = BIO_get_mem_data(bd_out, NULL); + if (actual_sig_len <= 0) { + err = -EIO; + goto cleanup; + } + + if 
((size_t)actual_sig_len > opts->signature_sz) { + err = -ENOSPC; + goto cleanup; + } + + if (BIO_read(bd_out, opts->signature, actual_sig_len) != actual_sig_len) { + err = -EIO; + goto cleanup; + } + + opts->signature_sz = actual_sig_len; +cleanup: + BIO_free(bd_out); + CMS_ContentInfo_free(cms); + X509_free(x509); + EVP_PKEY_free(private_key); + BIO_free(bd_in); + DISPLAY_OSSL_ERR(err < 0); + return err; +} diff --git a/tools/bpf/bpftool/token.c b/tools/bpf/bpftool/token.c new file mode 100644 index 000000000000..c08f34b9d51b --- /dev/null +++ b/tools/bpf/bpftool/token.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2025 Didi Technology Co., Tao Chen */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <mntent.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "json_writer.h" +#include "main.h" + +#define MOUNTS_FILE "/proc/mounts" + +static struct { + const char *header; + const char *key; +} sets[] = { + {"allowed_cmds", "delegate_cmds"}, + {"allowed_maps", "delegate_maps"}, + {"allowed_progs", "delegate_progs"}, + {"allowed_attachs", "delegate_attachs"}, +}; + +static bool has_delegate_options(const char *mnt_ops) +{ + return strstr(mnt_ops, "delegate_cmds") || + strstr(mnt_ops, "delegate_maps") || + strstr(mnt_ops, "delegate_progs") || + strstr(mnt_ops, "delegate_attachs"); +} + +static char *get_delegate_value(char *opts, const char *key) +{ + char *token, *rest, *ret = NULL; + + if (!opts) + return NULL; + + for (token = strtok_r(opts, ",", &rest); token; + token = strtok_r(NULL, ",", &rest)) { + if (strncmp(token, key, strlen(key)) == 0 && + token[strlen(key)] == '=') { + ret = token + strlen(key) + 1; + break; + } + } + + return ret; +} + +static void print_items_per_line(char *input, int items_per_line) +{ + char *str, *rest; + int cnt = 0; 
+ + if (!input) + return; + + for (str = strtok_r(input, ":", &rest); str; + str = strtok_r(NULL, ":", &rest)) { + if (cnt % items_per_line == 0) + printf("\n\t "); + + printf("%-20s", str); + cnt++; + } +} + +#define ITEMS_PER_LINE 4 +static void show_token_info_plain(struct mntent *mntent) +{ + size_t i; + + printf("token_info %s", mntent->mnt_dir); + + for (i = 0; i < ARRAY_SIZE(sets); i++) { + char *opts, *value; + + printf("\n\t%s:", sets[i].header); + opts = strdup(mntent->mnt_opts); + value = get_delegate_value(opts, sets[i].key); + print_items_per_line(value, ITEMS_PER_LINE); + free(opts); + } + + printf("\n"); +} + +static void split_json_array_str(char *input) +{ + char *str, *rest; + + if (!input) { + jsonw_start_array(json_wtr); + jsonw_end_array(json_wtr); + return; + } + + jsonw_start_array(json_wtr); + for (str = strtok_r(input, ":", &rest); str; + str = strtok_r(NULL, ":", &rest)) { + jsonw_string(json_wtr, str); + } + jsonw_end_array(json_wtr); +} + +static void show_token_info_json(struct mntent *mntent) +{ + size_t i; + + jsonw_start_object(json_wtr); + jsonw_string_field(json_wtr, "token_info", mntent->mnt_dir); + + for (i = 0; i < ARRAY_SIZE(sets); i++) { + char *opts, *value; + + jsonw_name(json_wtr, sets[i].header); + opts = strdup(mntent->mnt_opts); + value = get_delegate_value(opts, sets[i].key); + split_json_array_str(value); + free(opts); + } + + jsonw_end_object(json_wtr); +} + +static int __show_token_info(struct mntent *mntent) +{ + if (json_output) + show_token_info_json(mntent); + else + show_token_info_plain(mntent); + + return 0; +} + +static int show_token_info(void) +{ + FILE *fp; + struct mntent *ent; + + fp = setmntent(MOUNTS_FILE, "r"); + if (!fp) { + p_err("Failed to open: %s", MOUNTS_FILE); + return -1; + } + + if (json_output) + jsonw_start_array(json_wtr); + + while ((ent = getmntent(fp)) != NULL) { + if (strncmp(ent->mnt_type, "bpf", 3) == 0) { + if (has_delegate_options(ent->mnt_opts)) + __show_token_info(ent); + } + } + 
+ if (json_output) + jsonw_end_array(json_wtr); + + endmntent(fp); + + return 0; +} + +static int do_show(int argc, char **argv) +{ + if (argc) + return BAD_ARG(); + + return show_token_info(); +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list }\n" + " %1$s %2$s help\n" + " " HELP_SPEC_OPTIONS " }\n" + "\n" + "", + bin_name, argv[-2]); + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { 0 } +}; + +int do_token(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/tracelog.c b/tools/bpf/bpftool/tracelog.c index 31d806e3bdaa..573a8d99f009 100644 --- a/tools/bpf/bpftool/tracelog.c +++ b/tools/bpf/bpftool/tracelog.c @@ -57,10 +57,8 @@ find_tracefs_mnt_single(unsigned long magic, char *mnt, const char *mntpt) static bool get_tracefs_pipe(char *mnt) { static const char * const known_mnts[] = { - "/sys/kernel/debug/tracing", "/sys/kernel/tracing", - "/tracing", - "/trace", + "/sys/kernel/debug/tracing", }; const char *pipe_name = "/trace_pipe"; const char *fstype = "tracefs"; @@ -95,12 +93,7 @@ static bool get_tracefs_pipe(char *mnt) return false; p_info("could not find tracefs, attempting to mount it now"); - /* Most of the time, tracefs is automatically mounted by debugfs at - * /sys/kernel/debug/tracing when we try to access it. If we could not - * find it, it is likely that debugfs is not mounted. Let's give one - * attempt at mounting just tracefs at /sys/kernel/tracing. 
- */ - strcpy(mnt, known_mnts[1]); + strcpy(mnt, known_mnts[0]); if (mount_tracefs(mnt)) return false; diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile index ed565eb52275..342e056c8c66 100644 --- a/tools/gpio/Makefile +++ b/tools/gpio/Makefile @@ -77,7 +77,7 @@ $(OUTPUT)gpio-watch: $(GPIO_WATCH_IN) clean: rm -f $(ALL_PROGRAMS) - rm -f $(OUTPUT)include/linux/gpio.h + rm -rf $(OUTPUT)include find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) diff --git a/tools/include/linux/args.h b/tools/include/linux/args.h new file mode 100644 index 000000000000..2e8e65d975c7 --- /dev/null +++ b/tools/include/linux/args.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_ARGS_H +#define _LINUX_ARGS_H + +/* + * How do these macros work? + * + * In __COUNT_ARGS() _0 to _12 are just placeholders from the start + * in order to make sure _n is positioned over the correct number + * from 12 to 0 (depending on X, which is a variadic argument list). + * They serve no purpose other than occupying a position. Since each + * macro parameter must have a distinct identifier, those identifiers + * are as good as any. + * + * In COUNT_ARGS() we use actual integers, so __COUNT_ARGS() returns + * that as _n. + */ + +/* This counts to 15. Any more, it will return 16th argument. */ +#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _n, X...) _n +#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + +/* Concatenate two parameters, but allow them to be expanded beforehand. 
*/ +#define __CONCAT(a, b) a ## b +#define CONCATENATE(a, b) __CONCAT(a, b) + +#endif /* _LINUX_ARGS_H */ diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 7ad056219115..a40cc861b3a7 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -2,10 +2,8 @@ #ifndef __LINUX_BITS_H #define __LINUX_BITS_H -#include <linux/const.h> #include <vdso/bits.h> #include <uapi/linux/bits.h> -#include <asm/bitsperlong.h> #define BIT_MASK(nr) (UL(1) << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) @@ -50,10 +48,14 @@ (type_max(t) << (l) & \ type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h))))) +#define GENMASK(h, l) GENMASK_TYPE(unsigned long, h, l) +#define GENMASK_ULL(h, l) GENMASK_TYPE(unsigned long long, h, l) + #define GENMASK_U8(h, l) GENMASK_TYPE(u8, h, l) #define GENMASK_U16(h, l) GENMASK_TYPE(u16, h, l) #define GENMASK_U32(h, l) GENMASK_TYPE(u32, h, l) #define GENMASK_U64(h, l) GENMASK_TYPE(u64, h, l) +#define GENMASK_U128(h, l) GENMASK_TYPE(u128, h, l) /* * Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The @@ -79,28 +81,9 @@ * BUILD_BUG_ON_ZERO is not available in h files included from asm files, * disable the input check if that is the case. */ -#define GENMASK_INPUT_CHECK(h, l) 0 +#define GENMASK(h, l) __GENMASK(h, l) +#define GENMASK_ULL(h, l) __GENMASK_ULL(h, l) #endif /* !defined(__ASSEMBLY__) */ -#define GENMASK(h, l) \ - (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) -#define GENMASK_ULL(h, l) \ - (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) - -#if !defined(__ASSEMBLY__) -/* - * Missing asm support - * - * __GENMASK_U128() depends on _BIT128() which would not work - * in the asm code, as it shifts an 'unsigned __int128' data - * type instead of direct representation of 128 bit constants - * such as long and unsigned long. The fundamental problem is - * that a 128 bit constant will get silently truncated by the - * gcc compiler. 
- */ -#define GENMASK_U128(h, l) \ - (GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l)) -#endif - #endif /* __LINUX_BITS_H */ diff --git a/tools/include/linux/cfi_types.h b/tools/include/linux/cfi_types.h index 6b8713675765..fb8d90bff92e 100644 --- a/tools/include/linux/cfi_types.h +++ b/tools/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include <linux/linkage.h> -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_<function> type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,17 +29,40 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) #endif +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_CFI_CLANG +#define DEFINE_CFI_TYPE(name, func) \ + /* \ + * Force a reference to the function so the compiler generates \ + * __kcfi_typeid_<func>. 
\ + */ \ + __ADDRESSABLE(func); \ + /* u32 name __ro_after_init = __kcfi_typeid_<func> */ \ + extern u32 name; \ + asm ( \ + " .pushsection .data..ro_after_init,\"aw\",\%progbits \n" \ + " .type " #name ",\%object \n" \ + " .globl " #name " \n" \ + " .p2align 2, 0x0 \n" \ + #name ": \n" \ + " .4byte __kcfi_typeid_" #func " \n" \ + " .size " #name ", 4 \n" \ + " .popsection \n" \ + ); +#endif + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_CFI_TYPES_H */ diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 2892a45023af..04e0077fb4c9 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -852,8 +852,14 @@ __SYSCALL(__NR_removexattrat, sys_removexattrat) #define __NR_open_tree_attr 467 __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) +/* fs/inode.c */ +#define __NR_file_getattr 468 +__SYSCALL(__NR_file_getattr, sys_file_getattr) +#define __NR_file_setattr 469 +__SYSCALL(__NR_file_setattr, sys_file_setattr) + #undef __NR_syscalls -#define __NR_syscalls 468 +#define __NR_syscalls 470 /* * 32 bit systems traditionally used different diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 233de8677382..ae83d8649ef1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ @@ -1605,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. 
*/ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. + */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -6666,6 +6682,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { @@ -7418,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 7415a3863891..f0f0d49d2544 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -178,6 +178,7 @@ struct kvm_xen_exit { #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 +#define KVM_EXIT_TDX 40 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -447,6 +448,31 @@ struct kvm_run { __u64 gpa; __u64 size; } memory_fault; + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + __u64 ret; + __u64 data[5]; + } unknown; + struct { + __u64 ret; + __u64 gpa; + __u64 size; + } get_quote; + struct { + __u64 ret; + __u64 leaf; + __u64 r11, r12, r13, r14; + } get_tdvmcall_info; + struct { + __u64 ret; + __u64 vector; + } setup_event_notify; + }; + } tdx; /* Fix the size of the union. 
*/ char padding[256]; }; @@ -935,6 +961,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2 240 #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 +#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 34127653fd00..33c9b578b3b2 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,4 +40,19 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. 
*/ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + +enum init_ns_ino { + IPC_NS_INIT_INO = 0xEFFFFFFFU, + UTS_NS_INIT_INO = 0xEFFFFFFEU, + USER_NS_INIT_INO = 0xEFFFFFFDU, + PID_NS_INIT_INO = 0xEFFFFFFCU, + CGROUP_NS_INIT_INO = 0xEFFFFFFBU, + TIME_NS_INIT_INO = 0xEFFFFFFAU, + NET_NS_INIT_INO = 0xEFFFFFF9U, + MNT_NS_INIT_INO = 0xEFFFFFF8U, +}; + #endif /* __LINUX_NSFS_H */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index ab40dbf9f020..339b19797237 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -172,7 +172,7 @@ int bpf_map_create(enum bpf_map_type map_type, __u32 max_entries, const struct bpf_map_create_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd); + const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size); union bpf_attr attr; int fd; @@ -203,6 +203,8 @@ int bpf_map_create(enum bpf_map_type map_type, attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); attr.map_token_fd = OPTS_GET(opts, token_fd, 0); + attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL)); + attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0); fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); return libbpf_err_errno(fd); @@ -238,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt); + const size_t attr_sz = offsetofend(union bpf_attr, keyring_id); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 7252150e7ad3..e983a3e40d61 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -54,9 +54,12 @@ struct bpf_map_create_opts { __s32 value_type_btf_obj_fd; __u32 token_fd; + + const void *excl_prog_hash; + __u32 excl_prog_hash_size; size_t :0; }; -#define 
bpf_map_create_opts__last_field token_fd +#define bpf_map_create_opts__last_field excl_prog_hash_size LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, const char *map_name, diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h index 6ff963a491d9..49af4260b8e6 100644 --- a/tools/lib/bpf/bpf_gen_internal.h +++ b/tools/lib/bpf/bpf_gen_internal.h @@ -4,6 +4,7 @@ #define __BPF_GEN_INTERNAL_H #include "bpf.h" +#include "libbpf_internal.h" struct ksym_relo_desc { const char *name; @@ -50,6 +51,7 @@ struct bpf_gen { __u32 nr_ksyms; int fd_array; int nr_fd_array; + int hash_insn_offset[SHA256_DWORD_SIZE]; }; void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps); diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 113ae4abd345..6945dd99a846 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -110,6 +110,7 @@ static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn in static int add_data(struct bpf_gen *gen, const void *data, __u32 size); static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off); +static void emit_signature_match(struct bpf_gen *gen); void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps) { @@ -152,6 +153,8 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps /* R7 contains the error code from sys_bpf. Copy it into R0 and exit. 
*/ emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7)); emit(gen, BPF_EXIT_INSN()); + if (OPTS_GET(gen->opts, gen_hash, false)) + emit_signature_match(gen); } static int add_data(struct bpf_gen *gen, const void *data, __u32 size) @@ -368,6 +371,8 @@ static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off) __emit_sys_close(gen); } +static void compute_sha_update_offsets(struct bpf_gen *gen); + int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) { int i; @@ -394,6 +399,9 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) blob_fd_array_off(gen, i)); emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0)); emit(gen, BPF_EXIT_INSN()); + if (OPTS_GET(gen->opts, gen_hash, false)) + compute_sha_update_offsets(gen); + pr_debug("gen: finish %s\n", errstr(gen->error)); if (!gen->error) { struct gen_loader_opts *opts = gen->opts; @@ -446,6 +454,22 @@ void bpf_gen__free(struct bpf_gen *gen) _val; \ }) +static void compute_sha_update_offsets(struct bpf_gen *gen) +{ + __u64 sha[SHA256_DWORD_SIZE]; + __u64 sha_dw; + int i; + + libbpf_sha256(gen->data_start, gen->data_cur - gen->data_start, (__u8 *)sha); + for (i = 0; i < SHA256_DWORD_SIZE; i++) { + struct bpf_insn *insn = + (struct bpf_insn *)(gen->insn_start + gen->hash_insn_offset[i]); + sha_dw = tgt_endian(sha[i]); + insn[0].imm = (__u32)sha_dw; + insn[1].imm = sha_dw >> 32; + } +} + void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, __u32 btf_raw_size) { @@ -557,6 +581,29 @@ void bpf_gen__map_create(struct bpf_gen *gen, emit_sys_close_stack(gen, stack_off(inner_map_fd)); } +static void emit_signature_match(struct bpf_gen *gen) +{ + __s64 off; + int i; + + for (i = 0; i < SHA256_DWORD_SIZE; i++) { + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX, + 0, 0, 0, 0)); + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, i * sizeof(__u64))); + gen->hash_insn_offset[i] = gen->insn_cur - gen->insn_start; + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_3, 0, 0, 0, 0, 0)); + + off = 
-(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 1; + if (is_simm16(off)) { + emit(gen, BPF_MOV64_IMM(BPF_REG_7, -EINVAL)); + emit(gen, BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, off)); + } else { + gen->error = -ERANGE; + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1)); + } + } +} + void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name, enum bpf_attach_type type) { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8f5a81b672e1..f92083f51bdb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -35,6 +35,7 @@ #include <linux/perf_event.h> #include <linux/bpf_perf_event.h> #include <linux/ring_buffer.h> +#include <linux/unaligned.h> #include <sys/epoll.h> #include <sys/ioctl.h> #include <sys/mman.h> @@ -496,6 +497,7 @@ struct bpf_program { __u32 line_info_rec_size; __u32 line_info_cnt; __u32 prog_flags; + __u8 hash[SHA256_DIGEST_LENGTH]; }; struct bpf_struct_ops { @@ -575,6 +577,7 @@ struct bpf_map { bool autocreate; bool autoattach; __u64 map_extra; + struct bpf_program *excl_prog; }; enum extern_type { @@ -1013,35 +1016,33 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, const struct btf_member *kern_data_member; struct btf *btf = NULL; __s32 kern_vtype_id, kern_type_id; - char tname[256]; + char tname[192], stname[256]; __u32 i; snprintf(tname, sizeof(tname), "%.*s", (int)bpf_core_essential_name_len(tname_raw), tname_raw); - kern_type_id = find_ksym_btf_id(obj, tname, BTF_KIND_STRUCT, - &btf, mod_btf); - if (kern_type_id < 0) { - pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", - tname); - return kern_type_id; - } - kern_type = btf__type_by_id(btf, kern_type_id); + snprintf(stname, sizeof(stname), "%s%s", STRUCT_OPS_VALUE_PREFIX, tname); - /* Find the corresponding "map_value" type that will be used - * in map_update(BPF_MAP_TYPE_STRUCT_OPS). For example, - * find "struct bpf_struct_ops_tcp_congestion_ops" from the - * btf_vmlinux. 
+ /* Look for the corresponding "map_value" type that will be used + * in map_update(BPF_MAP_TYPE_STRUCT_OPS) first, figure out the btf + * and the mod_btf. + * For example, find "struct bpf_struct_ops_tcp_congestion_ops". */ - kern_vtype_id = find_btf_by_prefix_kind(btf, STRUCT_OPS_VALUE_PREFIX, - tname, BTF_KIND_STRUCT); + kern_vtype_id = find_ksym_btf_id(obj, stname, BTF_KIND_STRUCT, &btf, mod_btf); if (kern_vtype_id < 0) { - pr_warn("struct_ops init_kern: struct %s%s is not found in kernel BTF\n", - STRUCT_OPS_VALUE_PREFIX, tname); + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", stname); return kern_vtype_id; } kern_vtype = btf__type_by_id(btf, kern_vtype_id); + kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (kern_type_id < 0) { + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", tname); + return kern_type_id; + } + kern_type = btf__type_by_id(btf, kern_type_id); + /* Find "struct tcp_congestion_ops" from * struct bpf_struct_ops_tcp_congestion_ops { * [ ... ] @@ -1054,8 +1055,8 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, break; } if (i == btf_vlen(kern_vtype)) { - pr_warn("struct_ops init_kern: struct %s data is not found in struct %s%s\n", - tname, STRUCT_OPS_VALUE_PREFIX, tname); + pr_warn("struct_ops init_kern: struct %s data is not found in struct %s\n", + tname, stname); return -EINVAL; } @@ -4485,6 +4486,44 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) } } +static int bpf_prog_compute_hash(struct bpf_program *prog) +{ + struct bpf_insn *purged; + int i, err = 0; + + purged = calloc(prog->insns_cnt, BPF_INSN_SZ); + if (!purged) + return -ENOMEM; + + /* If relocations have been done, the map_fd needs to be + * discarded for the digest calculation. 
+ */ + for (i = 0; i < prog->insns_cnt; i++) { + purged[i] = prog->insns[i]; + if (purged[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + (purged[i].src_reg == BPF_PSEUDO_MAP_FD || + purged[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { + purged[i].imm = 0; + i++; + if (i >= prog->insns_cnt || + prog->insns[i].code != 0 || + prog->insns[i].dst_reg != 0 || + prog->insns[i].src_reg != 0 || + prog->insns[i].off != 0) { + err = -EINVAL; + goto out; + } + purged[i] = prog->insns[i]; + purged[i].imm = 0; + } + } + libbpf_sha256(purged, prog->insns_cnt * sizeof(struct bpf_insn), + prog->hash); +out: + free(purged); + return err; +} + static int bpf_program__record_reloc(struct bpf_program *prog, struct reloc_desc *reloc_desc, __u32 insn_idx, const char *sym_name, @@ -5093,6 +5132,16 @@ static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) return false; } + /* + * bpf_get_map_info_by_fd() for DEVMAP will always return flags with + * BPF_F_RDONLY_PROG set, but it generally is not set at map creation time. + * Thus, ignore the BPF_F_RDONLY_PROG flag in the flags returned from + * bpf_get_map_info_by_fd() when checking for compatibility with an + * existing DEVMAP. 
+ */ + if (map->def.type == BPF_MAP_TYPE_DEVMAP || map->def.type == BPF_MAP_TYPE_DEVMAP_HASH) + map_info.map_flags &= ~BPF_F_RDONLY_PROG; + return (map_info.type == map->def.type && map_info.key_size == map->def.key_size && map_info.value_size == map->def.value_size && @@ -5224,6 +5273,14 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b create_attr.token_fd = obj->token_fd; if (obj->token_fd) create_attr.map_flags |= BPF_F_TOKEN_FD; + if (map->excl_prog) { + err = bpf_prog_compute_hash(map->excl_prog); + if (err) + return err; + + create_attr.excl_prog_hash = map->excl_prog->hash; + create_attr.excl_prog_hash_size = SHA256_DIGEST_LENGTH; + } if (bpf_map__is_struct_ops(map)) { create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; @@ -10514,6 +10571,27 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) return 0; } +int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog) +{ + if (map_is_created(map)) { + pr_warn("exclusive programs must be set before map creation\n"); + return libbpf_err(-EINVAL); + } + + if (map->obj != prog->obj) { + pr_warn("excl_prog and map must be from the same bpf object\n"); + return libbpf_err(-EINVAL); + } + + map->excl_prog = prog; + return 0; +} + +struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map) +{ + return map->excl_prog; +} + static struct bpf_map * __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) { @@ -14207,3 +14285,100 @@ void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) free(s->progs); free(s); } + +static inline __u32 ror32(__u32 v, int bits) +{ + return (v >> bits) | (v << (32 - bits)); +} + +#define SHA256_BLOCK_LENGTH 64 +#define Ch(x, y, z) (((x) & (y)) ^ (~(x) & (z))) +#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) +#define Sigma_0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) +#define Sigma_1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) +#define 
sigma_0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) +#define sigma_1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) + +static const __u32 sha256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, + 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, + 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, + 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, + 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, + 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ + { \ + __u32 tmp = h + Sigma_1(e) + Ch(e, f, g) + sha256_K[i] + w[i]; \ + d += tmp; \ + h = tmp + Sigma_0(a) + Maj(a, b, c); \ + } + +static void sha256_blocks(__u32 state[8], const __u8 *data, size_t nblocks) +{ + while (nblocks--) { + __u32 a = state[0]; + __u32 b = state[1]; + __u32 c = state[2]; + __u32 d = state[3]; + __u32 e = state[4]; + __u32 f = state[5]; + __u32 g = state[6]; + __u32 h = state[7]; + __u32 w[64]; + int i; + + for (i = 0; i < 16; i++) + w[i] = get_unaligned_be32(&data[4 * i]); + for (; i < ARRAY_SIZE(w); i++) + w[i] = sigma_1(w[i - 2]) + w[i - 7] + + sigma_0(w[i - 15]) + w[i - 16]; + for (i = 0; i < ARRAY_SIZE(w); i += 8) { + SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); + SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); + SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); + SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); + SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); + SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); + SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); + 
SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); + } + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + data += SHA256_BLOCK_LENGTH; + } +} + +void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]) +{ + __u32 state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; + const __be64 bitcount = cpu_to_be64((__u64)len * 8); + __u8 final_data[2 * SHA256_BLOCK_LENGTH] = { 0 }; + size_t final_len = len % SHA256_BLOCK_LENGTH; + int i; + + sha256_blocks(state, data, len / SHA256_BLOCK_LENGTH); + + memcpy(final_data, data + len - final_len, final_len); + final_data[final_len] = 0x80; + final_len = round_up(final_len + 9, SHA256_BLOCK_LENGTH); + memcpy(&final_data[final_len - 8], &bitcount, 8); + + sha256_blocks(state, final_data, final_len / SHA256_BLOCK_LENGTH); + + for (i = 0; i < ARRAY_SIZE(state); i++) + put_unaligned_be32(state[i], &out[4 * i]); +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 455a957cb702..5118d0a90e24 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -24,8 +24,25 @@ extern "C" { #endif +/** + * @brief **libbpf_major_version()** provides the major version of libbpf. + * @return An integer, the major version number + */ LIBBPF_API __u32 libbpf_major_version(void); + +/** + * @brief **libbpf_minor_version()** provides the minor version of libbpf. + * @return An integer, the minor version number + */ LIBBPF_API __u32 libbpf_minor_version(void); + +/** + * @brief **libbpf_version_string()** provides the version of libbpf in a + * human-readable form, e.g., "v1.7". + * @return Pointer to a static string containing the version + * + * The format is *not* a part of a stable API and may change in the future. 
+ */ LIBBPF_API const char *libbpf_version_string(void); enum libbpf_errno { @@ -49,6 +66,14 @@ enum libbpf_errno { __LIBBPF_ERRNO__END, }; +/** + * @brief **libbpf_strerror()** converts the provided error code into a + * human-readable string. + * @param err The error code to convert + * @param buf Pointer to a buffer where the error message will be stored + * @param size The number of bytes in the buffer + * @return 0, on success; negative error code, otherwise + */ LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); /** @@ -252,7 +277,7 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, * @return 0, on success; negative error code, otherwise, error code is * stored in errno */ -int bpf_object__prepare(struct bpf_object *obj); +LIBBPF_API int bpf_object__prepare(struct bpf_object *obj); /** * @brief **bpf_object__load()** loads BPF object into kernel. @@ -1266,6 +1291,28 @@ LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, */ LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, const void *cur_key, void *next_key, size_t key_sz); +/** + * @brief **bpf_map__set_exclusive_program()** sets a map to be exclusive to the + * specified program. This must be called *before* the map is created. + * + * @param map BPF map to make exclusive. + * @param prog BPF program to be the exclusive user of the map. Must belong + * to the same bpf_object as the map. + * @return 0 on success; a negative error code otherwise. + * + * This function must be called after the BPF object is opened but before + * it is loaded. Once the object is loaded, only the specified program + * will be able to access the map's contents. + */ +LIBBPF_API int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog); + +/** + * @brief **bpf_map__exclusive_program()** returns the exclusive program + * that is registered with the map (if any). + * @param map BPF map to which the exclusive program is registered. 
+ * @return the registered exclusive program. + */ +LIBBPF_API struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map); struct bpf_xdp_set_link_opts { size_t sz; @@ -1810,9 +1857,10 @@ struct gen_loader_opts { const char *insns; __u32 data_sz; __u32 insns_sz; + bool gen_hash; }; -#define gen_loader_opts__last_field insns_sz +#define gen_loader_opts__last_field gen_hash LIBBPF_API int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d7bd463e7017..8ed8749907d4 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -448,4 +448,7 @@ LIBBPF_1.6.0 { } LIBBPF_1.5.0; LIBBPF_1.7.0 { + global: + bpf_map__set_exclusive_program; + bpf_map__exclusive_program; } LIBBPF_1.6.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 477a3b3389a0..c93797dcaf5b 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -736,4 +736,8 @@ int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, int probe_fd(int fd); +#define SHA256_DIGEST_LENGTH 32 +#define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64) + +void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]); #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 4d5fa079b5d6..6a8f5c7a02eb 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -13,10 +13,15 @@ #include <unistd.h> #include <sys/syscall.h> #include <sys/mman.h> +#include <linux/keyctl.h> #include <stdlib.h> #include "bpf.h" #endif +#ifndef SHA256_DIGEST_LENGTH +#define SHA256_DIGEST_LENGTH 32 +#endif + #ifndef __NR_bpf # if defined(__mips__) && defined(_ABIO32) # define __NR_bpf 4355 @@ -64,6 +69,11 @@ struct bpf_load_and_run_opts { __u32 data_sz; __u32 insns_sz; const char *errstr; + void *signature; + __u32 signature_sz; + __s32 keyring_id; 
+ void *excl_prog_hash; + __u32 excl_prog_hash_sz; }; long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size); @@ -220,14 +230,19 @@ static inline int skel_map_create(enum bpf_map_type map_type, const char *map_name, __u32 key_size, __u32 value_size, - __u32 max_entries) + __u32 max_entries, + const void *excl_prog_hash, + __u32 excl_prog_hash_sz) { - const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size); union bpf_attr attr; memset(&attr, 0, attr_sz); attr.map_type = map_type; + attr.excl_prog_hash = (unsigned long) excl_prog_hash; + attr.excl_prog_hash_size = excl_prog_hash_sz; + strncpy(attr.map_name, map_name, sizeof(attr.map_name)); attr.key_size = key_size; attr.value_size = value_size; @@ -300,6 +315,35 @@ static inline int skel_link_create(int prog_fd, int target_fd, return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz); } +static inline int skel_obj_get_info_by_fd(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, info); + __u8 sha[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 info_len = sizeof(info); + union bpf_attr attr; + + memset(&info, 0, sizeof(info)); + info.hash = (long) &sha; + info.hash_size = SHA256_DIGEST_LENGTH; + + memset(&attr, 0, attr_sz); + attr.info.bpf_fd = fd; + attr.info.info = (long) &info; + attr.info.info_len = info_len; + return skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz); +} + +static inline int skel_map_freeze(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_fd); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + + return skel_sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz); +} #ifdef __KERNEL__ #define set_err #else @@ -308,12 +352,13 @@ static inline int skel_link_create(int prog_fd, int target_fd, static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) { - const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t prog_load_attr_sz 
= offsetofend(union bpf_attr, keyring_id); const size_t test_run_attr_sz = offsetofend(union bpf_attr, test); int map_fd = -1, prog_fd = -1, key = 0, err; union bpf_attr attr; - err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1); + err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1, + opts->excl_prog_hash, opts->excl_prog_hash_sz); if (map_fd < 0) { opts->errstr = "failed to create loader map"; set_err; @@ -327,11 +372,34 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) goto out; } +#ifndef __KERNEL__ + err = skel_map_freeze(map_fd); + if (err < 0) { + opts->errstr = "failed to freeze map"; + set_err; + goto out; + } + err = skel_obj_get_info_by_fd(map_fd); + if (err < 0) { + opts->errstr = "failed to fetch obj info"; + set_err; + goto out; + } +#endif + memset(&attr, 0, prog_load_attr_sz); attr.prog_type = BPF_PROG_TYPE_SYSCALL; attr.insns = (long) opts->insns; attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn); attr.license = (long) "Dual BSD/GPL"; +#ifndef __KERNEL__ + attr.signature = (long) opts->signature; + attr.signature_size = opts->signature_sz; +#else + if (opts->signature || opts->signature_sz) + pr_warn("signatures are not supported from bpf_preload\n"); +#endif + attr.keyring_id = opts->keyring_id; memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); attr.fd_array = (long) &map_fd; attr.log_level = opts->ctx->log_level; diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index 2a7865c8e3fe..43deb05a5197 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -34,13 +34,32 @@ enum __bpf_usdt_arg_type { BPF_USDT_ARG_CONST, BPF_USDT_ARG_REG, BPF_USDT_ARG_REG_DEREF, + BPF_USDT_ARG_SIB, }; +/* + * This struct layout is designed specifically to be backwards/forward + * compatible between libbpf versions for ARG_CONST, ARG_REG, and + * ARG_REG_DEREF modes. ARG_SIB requires libbpf v1.7+. 
+ */ struct __bpf_usdt_arg_spec { /* u64 scalar interpreted depending on arg_type, see below */ __u64 val_off; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* arg location case, see bpf_usdt_arg() for details */ - enum __bpf_usdt_arg_type arg_type; + enum __bpf_usdt_arg_type arg_type: 8; + /* index register offset within struct pt_regs */ + __u16 idx_reg_off: 12; + /* scale factor for index register (1, 2, 4, or 8) */ + __u16 scale_bitshift: 4; + /* reserved for future use, keeps reg_off offset stable */ + __u8 __reserved: 8; +#else + __u8 __reserved: 8; + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + enum __bpf_usdt_arg_type arg_type: 8; +#endif /* offset of referenced register within struct pt_regs */ short reg_off; /* whether arg should be interpreted as signed value */ @@ -149,7 +168,7 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) { struct __bpf_usdt_spec *spec; struct __bpf_usdt_arg_spec *arg_spec; - unsigned long val; + unsigned long val, idx; int err, spec_id; *res = 0; @@ -204,6 +223,27 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) val >>= arg_spec->arg_bitshift; #endif break; + case BPF_USDT_ARG_SIB: + /* Arg is in memory addressed by SIB (Scale-Index-Base) mode + * (e.g., "-1@-96(%rbp,%rax,8)" in USDT arg spec). We first + * fetch the base register contents and the index register + * contents from pt_regs. Then we calculate the final address + * as base + (index * scale) + offset, and do a user-space + * probe read to fetch the argument value. 
+ */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + err = bpf_probe_read_kernel(&idx, sizeof(idx), (void *)ctx + arg_spec->idx_reg_off); + if (err) + return err; + err = bpf_probe_read_user(&val, sizeof(val), (void *)(val + (idx << arg_spec->scale_bitshift) + arg_spec->val_off)); + if (err) + return err; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + val >>= arg_spec->arg_bitshift; +#endif + break; default: return -EINVAL; } diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 3373b9d45ac4..fc2785eecc17 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -200,12 +200,23 @@ enum usdt_arg_type { USDT_ARG_CONST, USDT_ARG_REG, USDT_ARG_REG_DEREF, + USDT_ARG_SIB, }; /* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ struct usdt_arg_spec { __u64 val_off; - enum usdt_arg_type arg_type; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + enum usdt_arg_type arg_type: 8; + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + __u8 __reserved: 8; /* keep reg_off offset stable */ +#else + __u8 __reserved: 8; /* keep reg_off offset stable */ + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + enum usdt_arg_type arg_type: 8; +#endif short reg_off; bool arg_signed; char arg_bitshift; @@ -570,9 +581,8 @@ static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long o return NULL; } -static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, - const char *data, size_t name_off, size_t desc_off, - struct usdt_note *usdt_note); +static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, + size_t desc_off, struct usdt_note *usdt_note); static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie); @@ -626,7 +636,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * struct elf_seg *seg = NULL; void *tmp; - err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, 
desc_off, &note); + err = parse_usdt_note(&nhdr, data->d_buf, name_off, desc_off, &note); if (err) goto err_out; @@ -1132,8 +1142,7 @@ err_out: /* Parse out USDT ELF note from '.note.stapsdt' section. * Logic inspired by perf's code. */ -static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, - const char *data, size_t name_off, size_t desc_off, +static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, struct usdt_note *note) { const char *provider, *name, *args; @@ -1283,11 +1292,51 @@ static int calc_pt_regs_off(const char *reg_name) static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { - char reg_name[16]; - int len, reg_off; - long off; + char reg_name[16] = {0}, idx_reg_name[16] = {0}; + int len, reg_off, idx_reg_off, scale = 1; + long off = 0; + + if (sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^,] , %d ) %n", + arg_sz, &off, reg_name, idx_reg_name, &scale, &len) == 5 || + sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^,] , %d ) %n", + arg_sz, reg_name, idx_reg_name, &scale, &len) == 4 || + sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^)] ) %n", + arg_sz, &off, reg_name, idx_reg_name, &len) == 4 || + sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^)] ) %n", + arg_sz, reg_name, idx_reg_name, &len) == 3 + ) { + /* + * Scale Index Base case: + * 1@-96(%rbp,%rax,8) + * 1@(%rbp,%rax,8) + * 1@-96(%rbp,%rax) + * 1@(%rbp,%rax) + */ + arg->arg_type = USDT_ARG_SIB; + arg->val_off = off; - if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) { + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + + idx_reg_off = calc_pt_regs_off(idx_reg_name); + if (idx_reg_off < 0) + return idx_reg_off; + arg->idx_reg_off = idx_reg_off; + + /* validate scale factor and set fields directly */ + switch (scale) { + case 1: arg->scale_bitshift = 0; break; + case 2: arg->scale_bitshift = 1; break; + case 4: 
arg->scale_bitshift = 2; break; + case 8: arg->scale_bitshift = 3; break; + default: + pr_warn("usdt: invalid SIB scale %d, expected 1, 2, 4, 8\n", scale); + return -EINVAL; + } + } else if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", + arg_sz, &off, reg_name, &len) == 3) { /* Memory dereference case, e.g., -4@-20(%rbp) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; @@ -1306,6 +1355,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec } else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -4@%eax */ arg->arg_type = USDT_ARG_REG; + /* register read has no memory offset */ arg->val_off = 0; reg_off = calc_pt_regs_off(reg_name); diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index 9ef569492560..ddaeb4eb3e24 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -75,6 +75,9 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) size_t ci, cj, ei; int cmp; + if (!excludes->cnt) + return; + ci = cj = ei = 0; while (ci < cmds->cnt && ei < excludes->cnt) { cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index ef032e17fec4..eb295756c3bf 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -830,7 +830,7 @@ class TypeArrayNest(Type): 'ynl_attr_for_each_nested(attr2, attr) {', '\tif (ynl_attr_validate(yarg, attr2))', '\t\treturn YNL_PARSE_CB_ERROR;', - f'\t{var}->_count.{self.c_name}++;', + f'\tn_{self.c_name}++;', '}'] return get_lines, None, local_vars diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index b6fdc68053cc..2e555c4060c5 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -278,6 +278,25 @@ static bool decode_insn_reg2i16_fomat(union loongarch_instruction inst, return true; } +static bool 
decode_insn_reg3_fomat(union loongarch_instruction inst, + struct instruction *insn) +{ + switch (inst.reg3_format.opcode) { + case amswapw_op: + if (inst.reg3_format.rd == LOONGARCH_GPR_ZERO && + inst.reg3_format.rk == LOONGARCH_GPR_RA && + inst.reg3_format.rj == LOONGARCH_GPR_ZERO) { + /* amswap.w $zero, $ra, $zero */ + insn->type = INSN_BUG; + } + break; + default: + return false; + } + + return true; +} + int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, struct instruction *insn) @@ -309,11 +328,19 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec return 0; if (decode_insn_reg2i16_fomat(inst, insn)) return 0; + if (decode_insn_reg3_fomat(inst, insn)) + return 0; - if (inst.word == 0) + if (inst.word == 0) { + /* andi $zero, $zero, 0x0 */ insn->type = INSN_NOP; - else if (inst.reg0i15_format.opcode == break_op) { - /* break */ + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x0) { + /* break 0x0 */ + insn->type = INSN_TRAP; + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x1) { + /* break 0x1 */ insn->type = INSN_BUG; } else if (inst.reg2_format.opcode == ertn_op) { /* ertn */ diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index e39f86d97002..a80b75f7b061 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -27,6 +27,7 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, struct table_info *next_table; unsigned long tmp_insn_offset; unsigned long tmp_rodata_offset; + bool is_valid_list = false; rsec = find_section_by_name(file->elf, ".rela.discard.tablejump_annotate"); if (!rsec) @@ -35,6 +36,12 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, INIT_LIST_HEAD(&table_list); for_each_reloc(rsec, reloc) { + if 
(reloc->sym->sec->rodata) + continue; + + if (strcmp(insn->sec->name, reloc->sym->sec->name)) + continue; + orig_table = malloc(sizeof(struct table_info)); if (!orig_table) { WARN("malloc failed"); @@ -49,6 +56,22 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, if (reloc_idx(reloc) + 1 == sec_num_entries(rsec)) break; + + if (strcmp(insn->sec->name, (reloc + 1)->sym->sec->name)) { + list_for_each_entry(orig_table, &table_list, jump_info) { + if (orig_table->insn_offset == insn->offset) { + is_valid_list = true; + break; + } + } + + if (!is_valid_list) { + list_del_init(&table_list); + continue; + } + + break; + } } list_for_each_entry(orig_table, &table_list, jump_info) { diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 98c4713c1b09..0ad5cc70ecbe 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -880,3 +880,15 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_X86_64_32: + case R_X86_64_32S: + case R_X86_64_64: + return true; + default: + return false; + } +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 80239843e9f0..0f6b197cfcb0 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -87,6 +87,7 @@ static const struct option check_options[] = { OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"), OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), @@ -162,6 +163,7 @@ static bool opts_valid(void) opts.hack_noinstr || opts.ibt || 
opts.mcount || + opts.noabs || opts.noinstr || opts.orc || opts.retpoline || diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d14f20ef1db1..093fcd01dd6e 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3564,7 +3564,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6)) + !strncmp(func->name, "__pfx_", 6) || + !strncmp(func->name, "__pi___cfi_", 11) || + !strncmp(func->name, "__pi___pfx_", 11)) return 0; if (file->ignore_unreachables) @@ -4644,6 +4646,47 @@ static void disas_warned_funcs(struct objtool_file *file) disas_funcs(funcs); } +__weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + unsigned int type = reloc_type(reloc); + size_t sz = elf_addr_size(elf); + + return (sz == 8) ? (type == R_ABS64) : (type == R_ABS32); +} + +static int check_abs_references(struct objtool_file *file) +{ + struct section *sec; + struct reloc *reloc; + int ret = 0; + + for_each_sec(file, sec) { + /* absolute references in non-loadable sections are fine */ + if (!(sec->sh.sh_flags & SHF_ALLOC)) + continue; + + /* section must have an associated .rela section */ + if (!sec->rsec) + continue; + + /* + * Special case for compiler generated metadata that is not + * consumed until after boot. 
+ */ + if (!strcmp(sec->name, "__patchable_function_entries")) + continue; + + for_each_reloc(sec->rsec, reloc) { + if (arch_absolute_reloc(file->elf, reloc)) { + WARN("section %s has absolute relocation at offset 0x%lx", + sec->name, reloc_offset(reloc)); + ret++; + } + } + } + return ret; +} + struct insn_chunk { void *addr; struct insn_chunk *next; @@ -4777,6 +4820,9 @@ int check(struct objtool_file *file) goto out; } + if (opts.noabs) + warnings += check_abs_references(file); + if (opts.orc && nr_insns) { ret = orc_create(file); if (ret) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 01ef6f415adf..be33c7b43180 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -97,6 +97,7 @@ bool arch_is_embedded_insn(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); bool arch_pc_relative_reloc(struct reloc *reloc); +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 6b08666fa69d..ab22673862e1 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -26,6 +26,7 @@ struct opts { bool uaccess; int prefix; bool cfi; + bool noabs; /* options: */ bool backtrace; diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 6a922d046b8e..802895fae3ca 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -45,7 +45,6 @@ NORETURN(rewind_stack_and_make_dead) NORETURN(rust_begin_unwind) NORETURN(rust_helper_BUG) NORETURN(sev_es_terminate) -NORETURN(snp_abort) NORETURN(start_kernel) NORETURN(stop_this_cpu) NORETURN(usercopy_abort) diff --git a/tools/perf/arch/arm/entry/syscalls/syscall.tbl b/tools/perf/arch/arm/entry/syscalls/syscall.tbl index 
27c1d5ebcd91..b07e699aaa3c 100644 --- a/tools/perf/arch/arm/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/arm/entry/syscalls/syscall.tbl @@ -482,3 +482,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 1e8c44c7b614..7a7049c2c307 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -382,3 +382,5 @@ 465 n64 listxattrat sys_listxattrat 466 n64 removexattrat sys_removexattrat 467 n64 open_tree_attr sys_open_tree_attr +468 n64 file_getattr sys_file_getattr +469 n64 file_setattr sys_file_setattr diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 9a084bdb8926..b453e80dfc00 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -558,3 +558,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index a4569b96ef06..8a6744d658db 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -470,3 +470,5 @@ 465 common listxattrat sys_listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr sys_file_setattr diff --git a/tools/perf/arch/sh/entry/syscalls/syscall.tbl 
b/tools/perf/arch/sh/entry/syscalls/syscall.tbl index 52a7652fcff6..5e9c9eff5539 100644 --- a/tools/perf/arch/sh/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/sh/entry/syscalls/syscall.tbl @@ -471,3 +471,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/arch/sparc/entry/syscalls/syscall.tbl b/tools/perf/arch/sparc/entry/syscalls/syscall.tbl index 83e45eb6c095..ebb7d06d1044 100644 --- a/tools/perf/arch/sparc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/sparc/entry/syscalls/syscall.tbl @@ -513,3 +513,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl index ac007ea00979..4877e16da69a 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl @@ -473,3 +473,5 @@ 465 i386 listxattrat sys_listxattrat 466 i386 removexattrat sys_removexattrat 467 i386 open_tree_attr sys_open_tree_attr +468 i386 file_getattr sys_file_getattr +469 i386 file_setattr sys_file_setattr diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index cfb5ca41e30d..92cf0fe2291e 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -391,6 +391,8 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr # # Due to a historical design error, certain syscalls are numbered differently diff --git 
a/tools/perf/arch/x86/tests/topdown.c b/tools/perf/arch/x86/tests/topdown.c index 8d0ea7a4bbc1..1eba3b4594ef 100644 --- a/tools/perf/arch/x86/tests/topdown.c +++ b/tools/perf/arch/x86/tests/topdown.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "arch-tests.h" #include "../util/topdown.h" +#include "debug.h" #include "evlist.h" #include "parse-events.h" #include "pmu.h" diff --git a/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl b/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl index f657a77314f8..374e4cb788d8 100644 --- a/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl @@ -438,3 +438,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c index aad572a78d7f..12387ea88b9a 100644 --- a/tools/perf/bench/inject-buildid.c +++ b/tools/perf/bench/inject-buildid.c @@ -85,7 +85,7 @@ static int add_dso(const char *fpath, const struct stat *sb __maybe_unused, if (typeflag == FTW_D || typeflag == FTW_SL) return 0; - if (filename__read_build_id(fpath, &bid) < 0) + if (filename__read_build_id(fpath, &bid, /*block=*/true) < 0) return 0; dso->name = realpath(fpath, NULL); diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index c98104481c8a..2e0f2004696a 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -180,7 +180,7 @@ static int build_id_cache__add_file(const char *filename, struct nsinfo *nsi) struct nscookie nsc; nsinfo__mountns_enter(nsi, &nsc); - err = filename__read_build_id(filename, &bid); + err = filename__read_build_id(filename, &bid, /*block=*/true); nsinfo__mountns_exit(&nsc); if (err < 0) { pr_debug("Couldn't read a build-id in %s\n", filename); @@ -204,7 +204,7 @@ static int 
build_id_cache__remove_file(const char *filename, struct nsinfo *nsi) int err; nsinfo__mountns_enter(nsi, &nsc); - err = filename__read_build_id(filename, &bid); + err = filename__read_build_id(filename, &bid, /*block=*/true); nsinfo__mountns_exit(&nsc); if (err < 0) { pr_debug("Couldn't read a build-id in %s\n", filename); @@ -280,7 +280,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused) if (!dso__build_id_filename(dso, filename, sizeof(filename), false)) return true; - if (filename__read_build_id(filename, &bid) == -1) { + if (filename__read_build_id(filename, &bid, /*block=*/true) == -1) { if (errno == ENOENT) return false; @@ -309,7 +309,7 @@ static int build_id_cache__update_file(const char *filename, struct nsinfo *nsi) int err; nsinfo__mountns_enter(nsi, &nsc); - err = filename__read_build_id(filename, &bid); + err = filename__read_build_id(filename, &bid, /*block=*/true); nsinfo__mountns_exit(&nsc); if (err < 0) { pr_debug("Couldn't read a build-id in %s\n", filename); diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 40ba6a94f719..a114b3fa1bea 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -680,12 +680,12 @@ static int dso__read_build_id(struct dso *dso) mutex_lock(dso__lock(dso)); nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); - if (filename__read_build_id(dso__long_name(dso), &bid) > 0) + if (filename__read_build_id(dso__long_name(dso), &bid, /*block=*/true) > 0) dso__set_build_id(dso, &bid); else if (dso__nsinfo(dso)) { char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso)); - if (new_name && filename__read_build_id(new_name, &bid) > 0) + if (new_name && filename__read_build_id(new_name, &bid, /*block=*/true) > 0) dso__set_build_id(dso, &bid); free(new_name); } diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fd49703021fd..078634461df2 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2009,6 +2009,7 
@@ static int __cmd_contention(int argc, const char **argv) .owner = show_lock_owner, .cgroups = RB_ROOT, }; + struct perf_env host_env; lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table)); if (!lockhash_table) @@ -2024,7 +2025,10 @@ static int __cmd_contention(int argc, const char **argv) eops.mmap = perf_event__process_mmap; eops.tracing_data = perf_event__process_tracing_data; - session = perf_session__new(use_bpf ? NULL : &data, &eops); + perf_env__init(&host_env); + session = __perf_session__new(use_bpf ? NULL : &data, &eops, + /*trace_event_repipe=*/false, &host_env); + if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); err = PTR_ERR(session); @@ -2142,6 +2146,7 @@ out_delete: evlist__delete(con.evlist); lock_contention_finish(&con); perf_session__delete(session); + perf_env__exit(&host_env); zfree(&lockhash_table); return err; } diff --git a/tools/perf/tests/pe-file-parsing.c b/tools/perf/tests/pe-file-parsing.c index 30c7da79e109..8b31d1d05f90 100644 --- a/tools/perf/tests/pe-file-parsing.c +++ b/tools/perf/tests/pe-file-parsing.c @@ -37,7 +37,7 @@ static int run_dir(const char *d) size_t idx; scnprintf(filename, PATH_MAX, "%s/pe-file.exe", d); - ret = filename__read_build_id(filename, &bid); + ret = filename__read_build_id(filename, &bid, /*block=*/true); TEST_ASSERT_VAL("Failed to read build_id", ret == sizeof(expect_build_id)); TEST_ASSERT_VAL("Wrong build_id", !memcmp(bid.data, expect_build_id, @@ -49,7 +49,7 @@ static int run_dir(const char *d) !strcmp(debuglink, expect_debuglink)); scnprintf(debugfile, PATH_MAX, "%s/%s", d, debuglink); - ret = filename__read_build_id(debugfile, &bid); + ret = filename__read_build_id(debugfile, &bid, /*block=*/true); TEST_ASSERT_VAL("Failed to read debug file build_id", ret == sizeof(expect_build_id)); TEST_ASSERT_VAL("Wrong build_id", !memcmp(bid.data, expect_build_id, diff --git a/tools/perf/tests/sdt.c b/tools/perf/tests/sdt.c index 93baee2eae42..6132f1af3e22 100644 --- 
a/tools/perf/tests/sdt.c +++ b/tools/perf/tests/sdt.c @@ -31,7 +31,7 @@ static int build_id_cache__add_file(const char *filename) struct build_id bid = { .size = 0, }; int err; - err = filename__read_build_id(filename, &bid); + err = filename__read_build_id(filename, &bid, /*block=*/true); if (err < 0) { pr_debug("Failed to read build id of %s\n", filename); return err; diff --git a/tools/perf/tests/shell/test_bpf_metadata.sh b/tools/perf/tests/shell/test_bpf_metadata.sh index 69e3c2055134..be67d56e0f09 100755 --- a/tools/perf/tests/shell/test_bpf_metadata.sh +++ b/tools/perf/tests/shell/test_bpf_metadata.sh @@ -61,7 +61,7 @@ test_bpf_metadata() { /perf_version/ { if (entry) print $NF; } - ' | egrep "$VERS" > /dev/null + ' | grep -qF "$VERS" then echo "Basic BPF metadata test [Failed invalid output]" err=1 diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h index a15ac2fa4b20..f291ab4f94eb 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h @@ -90,10 +90,28 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* Reserved kernel ranges [-100], [-10000, -40000]. */ #define AT_FDCWD -100 /* Special value for dirfd used to indicate openat should use the current working directory. */ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gideon knot, for internal kernel usage, we refer to + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... 
+ */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ + +#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h index 0098b0ce8ccb..0bd678a4a10e 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fs.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -60,6 +60,17 @@ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ #define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ +/* + * The root inode of procfs is guaranteed to always have the same inode number. + * For programs that make heavy use of procfs, verifying that the root is a + * real procfs root and using openat2(RESOLVE_{NO_{XDEV,MAGICLINKS},BENEATH}) + * will allow you to make sure you are never tricked into operating on the + * wrong procfs file. + */ +enum procfs_ino { + PROCFS_ROOT_INO = 1, +}; + struct file_clone_range { __s64 src_fd; __u64 src_offset; @@ -91,6 +102,63 @@ struct fs_sysfs_path { __u8 name[128]; }; +/* Protection info capability flags */ +#define LBMD_PI_CAP_INTEGRITY (1 << 0) +#define LBMD_PI_CAP_REFTAG (1 << 1) + +/* Checksum types for Protection Information */ +#define LBMD_PI_CSUM_NONE 0 +#define LBMD_PI_CSUM_IP 1 +#define LBMD_PI_CSUM_CRC16_T10DIF 2 +#define LBMD_PI_CSUM_CRC64_NVME 4 + +/* sizeof first published struct */ +#define LBMD_SIZE_VER0 16 + +/* + * Logical block metadata capability descriptor + * If the device does not support metadata, all the fields will be zero. + * Applications must check lbmd_flags to determine whether metadata is + * supported or not. 
+ */ +struct logical_block_metadata_cap { + /* Bitmask of logical block metadata capability flags */ + __u32 lbmd_flags; + /* + * The amount of data described by each unit of logical block + * metadata + */ + __u16 lbmd_interval; + /* + * Size in bytes of the logical block metadata associated with each + * interval + */ + __u8 lbmd_size; + /* + * Size in bytes of the opaque block tag associated with each + * interval + */ + __u8 lbmd_opaque_size; + /* + * Offset in bytes of the opaque block tag within the logical block + * metadata + */ + __u8 lbmd_opaque_offset; + /* Size in bytes of the T10 PI tuple associated with each interval */ + __u8 lbmd_pi_size; + /* Offset in bytes of T10 PI tuple within the logical block metadata */ + __u8 lbmd_pi_offset; + /* T10 PI guard tag type */ + __u8 lbmd_guard_tag_type; + /* Size in bytes of the T10 PI application tag */ + __u8 lbmd_app_tag_size; + /* Size in bytes of the T10 PI reference tag */ + __u8 lbmd_ref_tag_size; + /* Size in bytes of the T10 PI storage tag */ + __u8 lbmd_storage_tag_size; + __u8 pad; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -149,6 +217,24 @@ struct fsxattr { }; /* + * Variable size structure for file_[sg]et_attr(). + * + * Note. This is alternative to the structure 'struct file_kattr'/'struct fsxattr'. + * As this structure is passed to/from userspace with its size, this can + * be versioned based on the size. 
+ */ +struct file_attr { + __u64 fa_xflags; /* xflags field value (get/set) */ + __u32 fa_extsize; /* extsize field value (get/set)*/ + __u32 fa_nextents; /* nextents field value (get) */ + __u32 fa_projid; /* project identifier (get/set) */ + __u32 fa_cowextsize; /* CoW extsize field value (get/set) */ +}; + +#define FILE_ATTR_SIZE_VER0 24 +#define FILE_ATTR_SIZE_LATEST FILE_ATTR_SIZE_VER0 + +/* * Flags for the fsx_xflags field */ #define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ @@ -247,6 +333,8 @@ struct fsxattr { * also /sys/kernel/debug/ for filesystems with debugfs exports */ #define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) +/* Get logical block metadata capability details */ +#define FS_IOC_GETLBMD_CAP _IOWR(0x15, 2, struct logical_block_metadata_cap) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h index 3b93fb906e3c..ed3aed264aeb 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h @@ -244,6 +244,8 @@ struct prctl_mm_map { # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) /* Unused; kept only for source compatibility */ # define PR_MTE_TCF_SHIFT 1 +/* MTE tag check store only */ +# define PR_MTE_STORE_ONLY (1UL << 19) /* RISC-V pointer masking tag length */ # define PR_PMLEN_SHIFT 24 # define PR_PMLEN_MASK (0x7fUL << PR_PMLEN_SHIFT) @@ -255,7 +257,12 @@ struct prctl_mm_map { /* Dispatch syscalls to a userspace handler */ #define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 -# define PR_SYS_DISPATCH_ON 1 +/* Enable dispatch except for the specified range */ +# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 +/* Enable dispatch for the specified range */ +# define PR_SYS_DISPATCH_INCLUSIVE_ON 2 +/* Legacy name for backwards compatibility */ +# define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON /* The control values for 
the user space selector when dispatch is enabled */ # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 diff --git a/tools/perf/trace/beauty/include/uapi/linux/vhost.h b/tools/perf/trace/beauty/include/uapi/linux/vhost.h index d4b3e2ae1314..c57674a6aa0d 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/vhost.h +++ b/tools/perf/trace/beauty/include/uapi/linux/vhost.h @@ -235,4 +235,39 @@ */ #define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) + +/* Extended features manipulation */ +#define VHOST_GET_FEATURES_ARRAY _IOR(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) +#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) + +/* fork_owner values for vhost */ +#define VHOST_FORK_OWNER_KTHREAD 0 +#define VHOST_FORK_OWNER_TASK 1 + +/** + * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device. + * This ioctl must be called before VHOST_SET_OWNER. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @param fork_owner: An 8-bit value that determines the vhost thread mode + * + * When fork_owner is set to VHOST_FORK_OWNER_TASK (default value): + * - Vhost will create vhost workers as tasks forked from the owner, + * inheriting all of the owner's attributes. + * + * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD: + * - Vhost will create vhost workers as kernel threads. + */ +#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x84, __u8) + +/** + * VHOST_GET_FORK_FROM_OWNER - Get the current fork_owner flag for the vhost device. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @return: An 8-bit value indicating the current thread mode.
+ */ +#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x85, __u8) + #endif diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 5b6d3e899e11..2298cd396c42 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -657,9 +657,15 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, info_node->info_linear = info_linear; info_node->metadata = NULL; if (!perf_env__insert_bpf_prog_info(env, info_node)) { - free(info_linear); + /* + * Insert failed, likely because of a duplicate event + * made by the sideband thread. Ignore synthesizing the + * metadata. + */ free(info_node); + goto out; } + /* info_linear is now owned by info_node and shouldn't be freed below. */ info_linear = NULL; /* @@ -827,18 +833,18 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, return err; } -static void perf_env__add_bpf_info(struct perf_env *env, u32 id) +static int perf_env__add_bpf_info(struct perf_env *env, u32 id) { struct bpf_prog_info_node *info_node; struct perf_bpil *info_linear; struct btf *btf = NULL; u64 arrays; u32 btf_id; - int fd; + int fd, err = 0; fd = bpf_prog_get_fd_by_id(id); if (fd < 0) - return; + return -EINVAL; arrays = 1UL << PERF_BPIL_JITED_KSYMS; arrays |= 1UL << PERF_BPIL_JITED_FUNC_LENS; @@ -852,6 +858,7 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id) info_linear = get_bpf_prog_info_linear(fd, arrays); if (IS_ERR_OR_NULL(info_linear)) { pr_debug("%s: failed to get BPF program info. 
aborting\n", __func__); + err = PTR_ERR(info_linear); goto out; } @@ -862,38 +869,46 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id) info_node->info_linear = info_linear; info_node->metadata = bpf_metadata_create(&info_linear->info); if (!perf_env__insert_bpf_prog_info(env, info_node)) { + pr_debug("%s: duplicate add bpf info request for id %u\n", + __func__, btf_id); free(info_linear); free(info_node); + goto out; } - } else + } else { free(info_linear); + err = -ENOMEM; + goto out; + } if (btf_id == 0) goto out; btf = btf__load_from_kernel_by_id(btf_id); - if (libbpf_get_error(btf)) { - pr_debug("%s: failed to get BTF of id %u, aborting\n", - __func__, btf_id); - goto out; + if (!btf) { + err = -errno; + pr_debug("%s: failed to get BTF of id %u %d\n", __func__, btf_id, err); + } else { + perf_env__fetch_btf(env, btf_id, btf); } - perf_env__fetch_btf(env, btf_id, btf); out: btf__free(btf); close(fd); + return err; } static int bpf_event__sb_cb(union perf_event *event, void *data) { struct perf_env *env = data; + int ret = 0; if (event->header.type != PERF_RECORD_BPF_EVENT) return -1; switch (event->bpf.type) { case PERF_BPF_EVENT_PROG_LOAD: - perf_env__add_bpf_info(env, event->bpf.id); + ret = perf_env__add_bpf_info(env, event->bpf.id); case PERF_BPF_EVENT_PROG_UNLOAD: /* @@ -907,7 +922,7 @@ static int bpf_event__sb_cb(union perf_event *event, void *data) break; } - return 0; + return ret; } int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env) diff --git a/tools/perf/util/bpf-utils.c b/tools/perf/util/bpf-utils.c index 80b1d2b3729b..5a66dc8594aa 100644 --- a/tools/perf/util/bpf-utils.c +++ b/tools/perf/util/bpf-utils.c @@ -20,7 +20,7 @@ struct bpil_array_desc { */ }; -static struct bpil_array_desc bpil_array_desc[] = { +static const struct bpil_array_desc bpil_array_desc[] = { [PERF_BPIL_JITED_INSNS] = { offsetof(struct bpf_prog_info, jited_prog_insns), offsetof(struct bpf_prog_info, jited_prog_len), @@ -115,7 +115,7 @@ 
get_bpf_prog_info_linear(int fd, __u64 arrays) __u32 info_len = sizeof(info); __u32 data_len = 0; int i, err; - void *ptr; + __u8 *ptr; if (arrays >> PERF_BPIL_LAST_ARRAY) return ERR_PTR(-EINVAL); @@ -126,15 +126,15 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) pr_debug("can't get prog info: %s", strerror(errno)); return ERR_PTR(-EFAULT); } + if (info.type >= __MAX_BPF_PROG_TYPE) + pr_debug("%s:%d: unexpected program type %u\n", __func__, __LINE__, info.type); /* step 2: calculate total size of all arrays */ for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { + const struct bpil_array_desc *desc = &bpil_array_desc[i]; bool include_array = (arrays & (1UL << i)) > 0; - struct bpil_array_desc *desc; __u32 count, size; - desc = bpil_array_desc + i; - /* kernel is too old to support this field */ if (info_len < desc->array_offset + sizeof(__u32) || info_len < desc->count_offset + sizeof(__u32) || @@ -163,19 +163,20 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) ptr = info_linear->data; for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { - struct bpil_array_desc *desc; + const struct bpil_array_desc *desc = &bpil_array_desc[i]; __u32 count, size; if ((arrays & (1UL << i)) == 0) continue; - desc = bpil_array_desc + i; count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); bpf_prog_info_set_offset_u32(&info_linear->info, desc->count_offset, count); bpf_prog_info_set_offset_u32(&info_linear->info, desc->size_offset, size); + assert(ptr >= info_linear->data); + assert(ptr < &info_linear->data[data_len]); bpf_prog_info_set_offset_u64(&info_linear->info, desc->array_offset, ptr_to_u64(ptr)); @@ -189,27 +190,45 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) free(info_linear); return ERR_PTR(-EFAULT); } + if (info_linear->info.type >= __MAX_BPF_PROG_TYPE) { + pr_debug("%s:%d: unexpected program type %u\n", + __func__, __LINE__, info_linear->info.type); + } /* step 6: 
verify the data */ + ptr = info_linear->data; for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { - struct bpil_array_desc *desc; - __u32 v1, v2; + const struct bpil_array_desc *desc = &bpil_array_desc[i]; + __u32 count1, count2, size1, size2; + __u64 ptr2; if ((arrays & (1UL << i)) == 0) continue; - desc = bpil_array_desc + i; - v1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, + count1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset); + count2 = bpf_prog_info_read_offset_u32(&info_linear->info, desc->count_offset); - if (v1 != v2) - pr_warning("%s: mismatch in element count\n", __func__); + if (count1 != count2) { + pr_warning("%s: mismatch in element count %u vs %u\n", __func__, count1, count2); + free(info_linear); + return ERR_PTR(-ERANGE); + } - v1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, + size1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset); + size2 = bpf_prog_info_read_offset_u32(&info_linear->info, desc->size_offset); - if (v1 != v2) - pr_warning("%s: mismatch in rec size\n", __func__); + if (size1 != size2) { + pr_warning("%s: mismatch in rec size %u vs %u\n", __func__, size1, size2); + free(info_linear); + return ERR_PTR(-ERANGE); + } + ptr2 = bpf_prog_info_read_offset_u64(&info_linear->info, desc->array_offset); + if (ptr_to_u64(ptr) != ptr2) { + pr_warning("%s: mismatch in array %p vs %llx\n", __func__, ptr, ptr2); + free(info_linear); + return ERR_PTR(-ERANGE); + } + ptr += roundup(count1 * size1, sizeof(__u64)); } /* step 7: update info_len and data_len */ @@ -224,13 +243,12 @@ void bpil_addr_to_offs(struct perf_bpil *info_linear) int i; for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { - struct bpil_array_desc *desc; + const struct bpil_array_desc *desc = &bpil_array_desc[i]; __u64 addr, offs; if ((info_linear->arrays & (1UL << i)) == 0) continue; - desc = 
bpil_array_desc + i; addr = bpf_prog_info_read_offset_u64(&info_linear->info, desc->array_offset); offs = addr - ptr_to_u64(info_linear->data); @@ -244,13 +262,12 @@ void bpil_offs_to_addr(struct perf_bpil *info_linear) int i; for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { - struct bpil_array_desc *desc; + const struct bpil_array_desc *desc = &bpil_array_desc[i]; __u64 addr, offs; if ((info_linear->arrays & (1UL << i)) == 0) continue; - desc = bpil_array_desc + i; offs = bpf_prog_info_read_offset_u64(&info_linear->info, desc->array_offset); addr = offs + ptr_to_u64(info_linear->data); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index a7018a3b0437..bf7f3268b9a2 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -115,7 +115,7 @@ int filename__snprintf_build_id(const char *pathname, char *sbuild_id, size_t sb struct build_id bid = { .size = 0, }; int ret; - ret = filename__read_build_id(pathname, &bid); + ret = filename__read_build_id(pathname, &bid, /*block=*/true); if (ret < 0) return ret; @@ -841,7 +841,7 @@ static int filename__read_build_id_ns(const char *filename, int ret; nsinfo__mountns_enter(nsi, &nsc); - ret = filename__read_build_id(filename, bid); + ret = filename__read_build_id(filename, bid, /*block=*/true); nsinfo__mountns_exit(&nsc); return ret; diff --git a/tools/perf/util/debuginfo.c b/tools/perf/util/debuginfo.c index a44c70f93156..bb9ebd84ec2d 100644 --- a/tools/perf/util/debuginfo.c +++ b/tools/perf/util/debuginfo.c @@ -110,8 +110,12 @@ struct debuginfo *debuginfo__new(const char *path) if (!dso) goto out; - /* Set the build id for DSO_BINARY_TYPE__BUILDID_DEBUGINFO */ - if (is_regular_file(path) && filename__read_build_id(path, &bid) > 0) + /* + * Set the build id for DSO_BINARY_TYPE__BUILDID_DEBUGINFO. Don't block + * in case the path isn't for a regular file.
+ */ + assert(!dso__has_build_id(dso)); + if (filename__read_build_id(path, &bid, /*block=*/false) > 0) dso__set_build_id(dso, &bid); for (type = distro_dwarf_types; diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index 0a7645c7fae7..64c1d65b0149 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -81,13 +81,13 @@ static int dsos__read_build_ids_cb(struct dso *dso, void *data) return 0; } nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); - if (filename__read_build_id(dso__long_name(dso), &bid) > 0) { + if (filename__read_build_id(dso__long_name(dso), &bid, /*block=*/true) > 0) { dso__set_build_id(dso, &bid); args->have_build_id = true; } else if (errno == ENOENT && dso__nsinfo(dso)) { char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso)); - if (new_name && filename__read_build_id(new_name, &bid) > 0) { + if (new_name && filename__read_build_id(new_name, &bid, /*block=*/true) > 0) { dso__set_build_id(dso, &bid); args->have_build_id = true; } diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h index 89979ca23c3f..34e2fdfe7300 100644 --- a/tools/perf/util/include/linux/linkage.h +++ b/tools/perf/util/include/linux/linkage.h @@ -120,7 +120,7 @@ #endif // In the kernel sources (include/linux/cfi_types.h), this has a different -// definition when CONFIG_CFI_CLANG is used, for tools/ just use the !clang +// definition when CONFIG_CFI is used, for tools/ just use the !cfi // definition: #ifndef SYM_TYPED_START #define SYM_TYPED_START(name, linkage, align...) 
\ diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 8fabddc1c0da..72c7a4e15d61 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -32,7 +32,7 @@ static void intel_pt_insn_decoder(struct insn *insn, intel_pt_insn->rel = 0; intel_pt_insn->emulated_ptwrite = false; - if (insn_is_avx(insn)) { + if (insn_is_avx_or_xop(insn)) { intel_pt_insn->op = INTEL_PT_OP_OTHER; intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH; intel_pt_insn->length = insn->length; diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 85b2a93a59ac..779f6230130a 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -477,6 +477,7 @@ static int __maps__insert(struct maps *maps, struct map *new) } /* Insert the value at the end. */ maps_by_address[nr_maps] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) maps_by_name[nr_maps] = map__get(new); @@ -502,8 +503,6 @@ static int __maps__insert(struct maps *maps, struct map *new) if (map__end(new) < map__start(new)) RC_CHK_ACCESS(maps)->ends_broken = true; - map__set_kmap_maps(new, maps); - return 0; } @@ -891,6 +890,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) if (before) { map__put(maps_by_address[i]); maps_by_address[i] = before; + map__set_kmap_maps(before, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -918,6 +918,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -942,14 +943,13 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); 
maps_by_name[ni] = map__get(new); } - map__set_kmap_maps(new, maps); - check_invariants(maps); return err; } @@ -1019,6 +1019,7 @@ int maps__copy_from(struct maps *dest, struct maps *parent) err = unwind__prepare_access(dest, new, NULL); if (!err) { dest_maps_by_address[i] = new; + map__set_kmap_maps(new, dest); if (dest_maps_by_name) dest_maps_by_name[i] = map__get(new); RC_CHK_ACCESS(dest)->nr_maps = i + 1; diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 6d2c280a1730..1346fd180653 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -873,13 +873,17 @@ out: #ifdef HAVE_LIBBFD_BUILDID_SUPPORT -static int read_build_id(const char *filename, struct build_id *bid) +static int read_build_id(const char *filename, struct build_id *bid, bool block) { size_t size = sizeof(bid->data); - int err = -1; + int err = -1, fd; bfd *abfd; - abfd = bfd_openr(filename, NULL); + fd = open(filename, block ? O_RDONLY : (O_RDONLY | O_NONBLOCK)); + if (fd < 0) + return -1; + + abfd = bfd_fdopenr(filename, /*target=*/NULL, fd); if (!abfd) return -1; @@ -902,7 +906,7 @@ out_close: #else // HAVE_LIBBFD_BUILDID_SUPPORT -static int read_build_id(const char *filename, struct build_id *bid) +static int read_build_id(const char *filename, struct build_id *bid, bool block) { size_t size = sizeof(bid->data); int fd, err = -1; @@ -911,7 +915,7 @@ static int read_build_id(const char *filename, struct build_id *bid) if (size < BUILD_ID_SIZE) goto out; - fd = open(filename, O_RDONLY); + fd = open(filename, block ? 
O_RDONLY : (O_RDONLY | O_NONBLOCK)); if (fd < 0) goto out; @@ -934,7 +938,7 @@ out: #endif // HAVE_LIBBFD_BUILDID_SUPPORT -int filename__read_build_id(const char *filename, struct build_id *bid) +int filename__read_build_id(const char *filename, struct build_id *bid, bool block) { struct kmod_path m = { .name = NULL, }; char path[PATH_MAX]; @@ -958,9 +962,10 @@ int filename__read_build_id(const char *filename, struct build_id *bid) } close(fd); filename = path; + block = true; } - err = read_build_id(filename, bid); + err = read_build_id(filename, bid, block); if (m.comp) unlink(filename); diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 7201494c5c20..41e4ebe5eac5 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -4,7 +4,6 @@ #include <errno.h> #include <unistd.h> -#include <stdio.h> #include <fcntl.h> #include <string.h> #include <stdlib.h> @@ -86,13 +85,10 @@ int filename__read_debuglink(const char *filename __maybe_unused, /* * Just try PT_NOTE header otherwise fails */ -int filename__read_build_id(const char *filename, struct build_id *bid) +int filename__read_build_id(const char *filename, struct build_id *bid, bool block) { - FILE *fp; - int ret = -1; + int fd, ret = -1; bool need_swap = false, elf32; - u8 e_ident[EI_NIDENT]; - int i; union { struct { Elf32_Ehdr ehdr32; @@ -103,28 +99,27 @@ int filename__read_build_id(const char *filename, struct build_id *bid) Elf64_Phdr *phdr64; }; } hdrs; - void *phdr; - size_t phdr_size; - void *buf = NULL; - size_t buf_size = 0; + void *phdr, *buf = NULL; + ssize_t phdr_size, ehdr_size, buf_size = 0; - fp = fopen(filename, "r"); - if (fp == NULL) + fd = open(filename, block ? 
O_RDONLY : (O_RDONLY | O_NONBLOCK)); + if (fd < 0) return -1; - if (fread(e_ident, sizeof(e_ident), 1, fp) != 1) + if (read(fd, hdrs.ehdr32.e_ident, EI_NIDENT) != EI_NIDENT) goto out; - if (memcmp(e_ident, ELFMAG, SELFMAG) || - e_ident[EI_VERSION] != EV_CURRENT) + if (memcmp(hdrs.ehdr32.e_ident, ELFMAG, SELFMAG) || + hdrs.ehdr32.e_ident[EI_VERSION] != EV_CURRENT) goto out; - need_swap = check_need_swap(e_ident[EI_DATA]); - elf32 = e_ident[EI_CLASS] == ELFCLASS32; + need_swap = check_need_swap(hdrs.ehdr32.e_ident[EI_DATA]); + elf32 = hdrs.ehdr32.e_ident[EI_CLASS] == ELFCLASS32; + ehdr_size = (elf32 ? sizeof(hdrs.ehdr32) : sizeof(hdrs.ehdr64)) - EI_NIDENT; - if (fread(elf32 ? (void *)&hdrs.ehdr32 : (void *)&hdrs.ehdr64, - elf32 ? sizeof(hdrs.ehdr32) : sizeof(hdrs.ehdr64), - 1, fp) != 1) + if (read(fd, + (elf32 ? (void *)&hdrs.ehdr32 : (void *)&hdrs.ehdr64) + EI_NIDENT, + ehdr_size) != ehdr_size) goto out; if (need_swap) { @@ -138,14 +133,18 @@ int filename__read_build_id(const char *filename, struct build_id *bid) hdrs.ehdr64.e_phnum = bswap_16(hdrs.ehdr64.e_phnum); } } - phdr_size = elf32 ? hdrs.ehdr32.e_phentsize * hdrs.ehdr32.e_phnum - : hdrs.ehdr64.e_phentsize * hdrs.ehdr64.e_phnum; + if ((elf32 && hdrs.ehdr32.e_phentsize != sizeof(Elf32_Phdr)) || + (!elf32 && hdrs.ehdr64.e_phentsize != sizeof(Elf64_Phdr))) + goto out; + + phdr_size = elf32 ? sizeof(Elf32_Phdr) * hdrs.ehdr32.e_phnum + : sizeof(Elf64_Phdr) * hdrs.ehdr64.e_phnum; phdr = malloc(phdr_size); if (phdr == NULL) goto out; - fseek(fp, elf32 ? hdrs.ehdr32.e_phoff : hdrs.ehdr64.e_phoff, SEEK_SET); - if (fread(phdr, phdr_size, 1, fp) != 1) + lseek(fd, elf32 ? hdrs.ehdr32.e_phoff : hdrs.ehdr64.e_phoff, SEEK_SET); + if (read(fd, phdr, phdr_size) != phdr_size) goto out_free; if (elf32) @@ -153,8 +152,8 @@ int filename__read_build_id(const char *filename, struct build_id *bid) else hdrs.phdr64 = phdr; - for (i = 0; i < elf32 ? 
hdrs.ehdr32.e_phnum : hdrs.ehdr64.e_phnum; i++) { - size_t p_filesz; + for (int i = 0; i < (elf32 ? hdrs.ehdr32.e_phnum : hdrs.ehdr64.e_phnum); i++) { + ssize_t p_filesz; if (need_swap) { if (elf32) { @@ -180,8 +179,8 @@ int filename__read_build_id(const char *filename, struct build_id *bid) goto out_free; buf = tmp; } - fseek(fp, elf32 ? hdrs.phdr32[i].p_offset : hdrs.phdr64[i].p_offset, SEEK_SET); - if (fread(buf, p_filesz, 1, fp) != 1) + lseek(fd, elf32 ? hdrs.phdr32[i].p_offset : hdrs.phdr64[i].p_offset, SEEK_SET); + if (read(fd, buf, p_filesz) != p_filesz) goto out_free; ret = read_build_id(buf, p_filesz, bid, need_swap); @@ -194,7 +193,7 @@ out_free: free(buf); free(phdr); out: - fclose(fp); + close(fd); return ret; } @@ -324,7 +323,7 @@ int dso__load_sym(struct dso *dso, struct map *map __maybe_unused, if (ret >= 0) RC_CHK_ACCESS(dso)->is_64_bit = ret; - if (filename__read_build_id(ss->name, &bid) > 0) + if (filename__read_build_id(ss->name, &bid, /*block=*/true) > 0) dso__set_build_id(dso, &bid); return 0; } diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index e816e4220d33..3fed54de5401 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1869,14 +1869,14 @@ int dso__load(struct dso *dso, struct map *map) /* * Read the build id if possible. This is required for - * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work + * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work. Don't block in case path + * isn't for a regular file. 
*/ - if (!dso__has_build_id(dso) && - is_regular_file(dso__long_name(dso))) { + if (!dso__has_build_id(dso)) { struct build_id bid = { .size = 0, }; __symbol__join_symfs(name, PATH_MAX, dso__long_name(dso)); - if (filename__read_build_id(name, &bid) > 0) + if (filename__read_build_id(name, &bid, /*block=*/false) > 0) dso__set_build_id(dso, &bid); } diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 3fb5d146d9b1..347106218799 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -140,7 +140,7 @@ struct symbol *dso__next_symbol(struct symbol *sym); enum dso_type dso__type_fd(int fd); -int filename__read_build_id(const char *filename, struct build_id *id); +int filename__read_build_id(const char *filename, struct build_id *id, bool block); int sysfs__read_build_id(const char *filename, struct build_id *bid); int modules__parse(const char *filename, void *arg, int (*process_module)(void *arg, const char *name, diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index cb2c1ace304a..fcd1fd13c30e 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -401,7 +401,7 @@ static void perf_record_mmap2__read_build_id(struct perf_record_mmap2 *event, nsi = nsinfo__new(event->pid); nsinfo__mountns_enter(nsi, &nc); - rc = filename__read_build_id(event->filename, &bid) > 0 ? 0 : -1; + rc = filename__read_build_id(event->filename, &bid, /*block=*/false) > 0 ? 0 : -1; nsinfo__mountns_exit(&nc); nsinfo__put(nsi); diff --git a/tools/power/cpupower/man/cpupower-set.1 b/tools/power/cpupower/man/cpupower-set.1 index 500653ef98c7..8ac82b6f9189 100644 --- a/tools/power/cpupower/man/cpupower-set.1 +++ b/tools/power/cpupower/man/cpupower-set.1 @@ -81,10 +81,11 @@ Refer to the AMD P-State kernel documentation for further information. 
.RE .PP -\-\-turbo\-boost, \-t +\-\-turbo\-boost, \-\-boost, \-t .RS 4 -This option is used to enable or disable the turbo boost feature on -supported Intel and AMD processors. +This option is used to enable or disable the boost feature on +supported Intel and AMD processors, and other boost supported systems. +(The --boost option is an alias for the --turbo-boost option) This option takes as parameter either \fB1\fP to enable, or \fB0\fP to disable the feature. diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index fc750e127404..7d3732f5f2f6 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -128,7 +128,7 @@ static int get_boost_mode_x86(unsigned int cpu) /* ToDo: Make this more global */ unsigned long pstates[MAX_HW_PSTATES] = {0,}; - ret = cpufreq_has_boost_support(cpu, &support, &active, &b_states); + ret = cpufreq_has_x86_boost_support(cpu, &support, &active, &b_states); if (ret) { printf(_("Error while evaluating Boost Capabilities" " on CPU %d -- are you root?\n"), cpu); @@ -204,6 +204,18 @@ static int get_boost_mode_x86(unsigned int cpu) return 0; } +static int get_boost_mode_generic(unsigned int cpu) +{ + bool active; + + if (!cpufreq_has_generic_boost_support(&active)) { + printf(_(" boost state support:\n")); + printf(_(" Active: %s\n"), active ? 
_("yes") : _("no")); + } + + return 0; +} + /* --boost / -b */ static int get_boost_mode(unsigned int cpu) @@ -214,6 +226,8 @@ static int get_boost_mode(unsigned int cpu) cpupower_cpu_info.vendor == X86_VENDOR_HYGON || cpupower_cpu_info.vendor == X86_VENDOR_INTEL) return get_boost_mode_x86(cpu); + else + get_boost_mode_generic(cpu); freqs = cpufreq_get_boost_frequencies(cpu); if (freqs) { diff --git a/tools/power/cpupower/utils/cpupower-set.c b/tools/power/cpupower/utils/cpupower-set.c index 0677b58374ab..c2117e5650dd 100644 --- a/tools/power/cpupower/utils/cpupower-set.c +++ b/tools/power/cpupower/utils/cpupower-set.c @@ -21,6 +21,7 @@ static struct option set_opts[] = { {"epp", required_argument, NULL, 'e'}, {"amd-pstate-mode", required_argument, NULL, 'm'}, {"turbo-boost", required_argument, NULL, 't'}, + {"boost", required_argument, NULL, 't'}, { }, }; @@ -62,8 +63,8 @@ int cmd_set(int argc, char **argv) params.params = 0; /* parameter parsing */ - while ((ret = getopt_long(argc, argv, "b:e:m:", - set_opts, NULL)) != -1) { + while ((ret = getopt_long(argc, argv, "b:e:m:t:", + set_opts, NULL)) != -1) { switch (ret) { case 'b': if (params.perf_bias) diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 95749b8ee475..82ea62bdf5a2 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -103,6 +103,9 @@ extern struct cpupower_cpu_info cpupower_cpu_info; /* cpuid and cpuinfo helpers **************************/ +int cpufreq_has_generic_boost_support(bool *active); +int cpupower_set_turbo_boost(int turbo_boost); + /* X86 ONLY ****************************************/ #if defined(__i386__) || defined(__x86_64__) @@ -118,7 +121,6 @@ extern unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu); extern int cpupower_set_epp(unsigned int cpu, char *epp); extern int cpupower_set_amd_pstate_mode(char *mode); -extern int cpupower_set_turbo_boost(int 
turbo_boost); /* Read/Write msr ****************************/ @@ -139,8 +141,8 @@ extern int decode_pstates(unsigned int cpu, int boost_states, /* AMD HW pstate decoding **************************/ -extern int cpufreq_has_boost_support(unsigned int cpu, int *support, - int *active, int * states); +int cpufreq_has_x86_boost_support(unsigned int cpu, int *support, + int *active, int *states); /* AMD P-State stuff **************************/ bool cpupower_amd_pstate_enabled(void); @@ -181,13 +183,11 @@ static inline int cpupower_set_epp(unsigned int cpu, char *epp) { return -1; }; static inline int cpupower_set_amd_pstate_mode(char *mode) { return -1; }; -static inline int cpupower_set_turbo_boost(int turbo_boost) -{ return -1; }; /* Read/Write msr ****************************/ -static inline int cpufreq_has_boost_support(unsigned int cpu, int *support, - int *active, int * states) +static inline int cpufreq_has_x86_boost_support(unsigned int cpu, int *support, + int *active, int *states) { return -1; } static inline bool cpupower_amd_pstate_enabled(void) diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index 76e461ff4f74..166dc1e470ea 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -8,15 +8,14 @@ #include "helpers/helpers.h" #include "helpers/sysfs.h" #include "cpufreq.h" +#include "cpupower_intern.h" #if defined(__i386__) || defined(__x86_64__) -#include "cpupower_intern.h" - #define MSR_AMD_HWCR 0xc0010015 -int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, - int *states) +int cpufreq_has_x86_boost_support(unsigned int cpu, int *support, int *active, + int *states) { int ret; unsigned long long val; @@ -124,24 +123,6 @@ int cpupower_set_amd_pstate_mode(char *mode) return 0; } -int cpupower_set_turbo_boost(int turbo_boost) -{ - char path[SYSFS_PATH_MAX]; - char linebuf[2] = {}; - - snprintf(path, sizeof(path), PATH_TO_CPU 
"cpufreq/boost"); - - if (!is_valid_path(path)) - return -1; - - snprintf(linebuf, sizeof(linebuf), "%d", turbo_boost); - - if (cpupower_write_sysfs(path, linebuf, 2) <= 0) - return -1; - - return 0; -} - bool cpupower_amd_pstate_enabled(void) { char *driver = cpufreq_get_driver(0); @@ -160,6 +141,39 @@ bool cpupower_amd_pstate_enabled(void) #endif /* #if defined(__i386__) || defined(__x86_64__) */ +int cpufreq_has_generic_boost_support(bool *active) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[2] = {}; + unsigned long val; + char *endp; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost"); + + if (!is_valid_path(path)) + return -EACCES; + + if (cpupower_read_sysfs(path, linebuf, 2) <= 0) + return -EINVAL; + + val = strtoul(linebuf, &endp, 0); + if (endp == linebuf || errno == ERANGE) + return -EINVAL; + + switch (val) { + case 0: + *active = false; + break; + case 1: + *active = true; + break; + default: + return -EINVAL; + } + + return 0; +} + /* get_cpustate * * Gather the information of all online CPUs into bitmask struct @@ -259,3 +273,21 @@ void print_speed(unsigned long speed, int no_rounding) } } } + +int cpupower_set_turbo_boost(int turbo_boost) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[2] = {}; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost"); + + if (!is_valid_path(path)) + return -1; + + snprintf(linebuf, sizeof(linebuf), "%d", turbo_boost); + + if (cpupower_write_sysfs(path, linebuf, 2) <= 0) + return -1; + + return 0; +} diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 72a280e7a9d5..47eb2d4d13a5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1195,7 +1195,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_EMERALDRAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_D, &spr_features }, - { INTEL_PANTHERCOVE_X, &dmr_features }, + { INTEL_DIAMONDRAPIDS_X, &dmr_features }, { 
INTEL_LAKEFIELD, &cnl_features }, { INTEL_ALDERLAKE, &adl_features }, { INTEL_ALDERLAKE_L, &adl_features }, diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h new file mode 100644 index 000000000000..4366fb3c91ce --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define __arena_global __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ +#else + +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + * + * This is a workaround for LLVM compiler versions without + * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena + * pointers and native kernel/userspace ones. In this case we explicitly do so + * with cast_kern() and cast_user(). E.g., in the Linux kernel tree, + * tools/testing/selftests/bpf includes tests that use these macros to implement + * linked lists and hashtables backed by arena memory. In sched_ext, we use + * cast_kern() and cast_user() for compatibility with older LLVM toolchains. 
+ */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ + .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + +#define __arena +#define __arena_global SEC(".addr_space.1") +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. 
+ */ +#ifdef TEST +#define can_loop true +#define __cond_break(expr) expr +#else +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ +#endif /* __BPF_FEATURE_MAY_GOTO */ +#endif /* TEST */ + +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) + + +void bpf_preempt_disable(void) __weak __ksym; +void 
bpf_preempt_enable(void) __weak __ksym; diff --git a/tools/sched_ext/include/scx/bpf_arena_common.h b/tools/sched_ext/include/scx/bpf_arena_common.h new file mode 100644 index 000000000000..10141db0b59d --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +/* Provide the definition of PAGE_SIZE. */ +#include <sys/user.h> + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +char __attribute__((weak)) arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index d4e21558e982..06e2551033cb 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -24,14 +24,26 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <asm-generic/errno.h> -#include "user_exit_info.h" +#include "user_exit_info.bpf.h" #include "enum_defs.autogen.h" +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ +#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_EXITING 0x00000004 
#define CLOCK_MONOTONIC 1 +#ifndef NR_CPUS +#define NR_CPUS 1024 +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + extern int LINUX_KERNEL_VERSION __kconfig; extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak; extern const char CONFIG_LOCALVERSION[64] __kconfig __weak; @@ -91,6 +103,8 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; +struct rq *scx_bpf_locked_rq(void) __ksym; +struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; @@ -107,6 +121,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __ static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} +#define SCX_STRINGIFY(x) #x +#define SCX_TOSTRING(x) SCX_STRINGIFY(x) + /* * Helper macro for initializing the fmt and variadic argument inputs to both * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to @@ -141,13 +158,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments * instead of an array of u64. Invoking this macro will cause the scheduler to * exit in an erroneous state, with diagnostic information being passed to the - * user. + * user. It appends the file and line number to aid debugging. */ #define scx_bpf_error(fmt, args...) 
\ ({ \ - scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_bstr_preamble( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ - ___scx_bpf_bstr_format_checker(fmt, ##args); \ + ___scx_bpf_bstr_format_checker( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \ }) /* @@ -229,6 +248,7 @@ BPF_PROG(name, ##args) * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of * `MEMBER_VPTR(ptr, ->member)`. */ +#ifndef MEMBER_VPTR #define MEMBER_VPTR(base, member) (typeof((base) member) *) \ ({ \ u64 __base = (u64)&(base); \ @@ -245,6 +265,7 @@ BPF_PROG(name, ##args) [max]"i"(sizeof(base) - sizeof((base) member))); \ __addr; \ }) +#endif /* MEMBER_VPTR */ /** * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element @@ -260,6 +281,7 @@ BPF_PROG(name, ##args) * size of the array to compute the max, which will result in rejection by * the verifier. */ +#ifndef ARRAY_ELEM_PTR #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ ({ \ u64 __base = (u64)arr; \ @@ -274,7 +296,7 @@ BPF_PROG(name, ##args) [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ __addr; \ }) - +#endif /* ARRAY_ELEM_PTR */ /* * BPF declarations and helpers @@ -438,8 +460,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) */ static inline bool is_migration_disabled(const struct task_struct *p) { - if (bpf_core_field_exists(p->migration_disabled)) - return p->migration_disabled; + /* + * Testing p->migration_disabled in a BPF code is tricky because the + * migration is _always_ disabled while running the BPF code. + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF + * code execution disable and re-enable the migration of the current + * task, respectively. So, the _current_ task of the sched_ext ops is + * always migration-disabled. 
Moreover, p->migration_disabled could be + * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is + * executed in the middle of the other BPF code execution. + * + * Therefore, we should decide that the _current_ task is + * migration-disabled only when its migration_disabled count is greater + * than one. In other words, when p->migration_disabled == 1, there is + * an ambiguity, so we should check if @p is the current task or not. + */ + if (bpf_core_field_exists(p->migration_disabled)) { + if (p->migration_disabled == 1) + return bpf_get_current_task_btf() != p; + else + return p->migration_disabled; + } return false; } @@ -476,7 +517,7 @@ static inline s64 time_delta(u64 after, u64 before) */ static inline bool time_after(u64 a, u64 b) { - return (s64)(b - a) < 0; + return (s64)(b - a) < 0; } /** @@ -500,7 +541,7 @@ static inline bool time_before(u64 a, u64 b) */ static inline bool time_after_eq(u64 a, u64 b) { - return (s64)(a - b) >= 0; + return (s64)(a - b) >= 0; } /** @@ -547,9 +588,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c) */ /* useful compiler attributes */ +#ifndef likely #define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#ifndef __maybe_unused #define __maybe_unused __attribute__((__unused__)) +#endif /* * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They @@ -633,6 +680,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s }) /* + * __calc_avg - Calculate exponential weighted moving average (EWMA) with + * @old and @new values. @decay represents how large the @old value remains. + * With a larger @decay value, the moving average changes slowly, exhibiting + * fewer fluctuations. 
+ */ +#define __calc_avg(old, new, decay) ({ \ + typeof(decay) thr = 1 << (decay); \ + typeof(old) ret; \ + if (((old) < thr) || ((new) < thr)) { \ + if (((old) == 1) && ((new) == 0)) \ + ret = 0; \ + else \ + ret = ((old) - ((old) >> 1)) + ((new) >> 1); \ + } else { \ + ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \ + } \ + ret; \ +}) + +/* * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. * @v: The value for which we're computing the base 2 logarithm. */ @@ -663,6 +730,25 @@ static inline u32 log2_u64(u64 v) } /* + * sqrt_u64 - Calculate the square root of value @x using Newton's method. + */ +static inline u64 __sqrt_u64(u64 x) +{ + if (x == 0 || x == 1) + return x; + + u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32); + + for (int i = 0; i < 8; ++i) { + u64 q = x / r; + if (r <= q) + break; + r = (r + q) >> 1; + } + return r; +} + +/* * Return a value proportionally scaled to the task's weight. */ static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value) diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index 1dc76bd84296..b3c6372bcf81 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -75,8 +75,9 @@ typedef int64_t s64; #include "enums.h" /* not available when building kernel tools/sched_ext */ -#if __has_include(<lib/sdt_task.h>) -#include <lib/sdt_task.h> +#if __has_include(<lib/sdt_task_defs.h>) +#include "bpf_arena_common.h" +#include <lib/sdt_task_defs.h> #endif #endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 9252e1a00556..dd9144624dc9 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym 
__weak; bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ @@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) +#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ + (bpf_ksym_exists(bpf_cpumask_populate) ? \ + (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) + #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") @@ -226,6 +231,23 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) scx_bpf_pick_any_cpu(cpus_allowed, flags)) /* + * v6.18: Add a helper to retrieve the current task running on a CPU. + * + * Keep this helper available until v6.20 for compatibility. + */ +static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu) +{ + struct rq *rq; + + if (bpf_ksym_exists(scx_bpf_cpu_curr)) + return scx_bpf_cpu_curr(cpu); + + rq = scx_bpf_cpu_rq(cpu); + + return rq ? rq->curr : NULL; +} + +/* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). 
*/ diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h new file mode 100644 index 000000000000..e7ac6611a990 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ + +#ifndef __USER_EXIT_INFO_BPF_H +#define __USER_EXIT_INFO_BPF_H + +#ifndef LSP +#include "vmlinux.h" +#endif +#include <bpf/bpf_core_read.h> + +#include "user_exit_info_common.h" + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#endif /* __USER_EXIT_INFO_BPF_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 66f856640ee7..399697fa372f 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -10,55 +10,11 @@ #ifndef __USER_EXIT_INFO_H #define __USER_EXIT_INFO_H -#ifdef LSP -#define __bpf__ -#include "../vmlinux.h" -#endif - -enum uei_sizes { - UEI_REASON_LEN = 128, - UEI_MSG_LEN = 1024, - 
UEI_DUMP_DFL_LEN = 32768, -}; - -struct user_exit_info { - int kind; - s64 exit_code; - char reason[UEI_REASON_LEN]; - char msg[UEI_MSG_LEN]; -}; - -#ifdef __bpf__ - -#ifndef LSP -#include "vmlinux.h" -#endif -#include <bpf/bpf_core_read.h> - -#define UEI_DEFINE(__name) \ - char RESIZABLE_ARRAY(data, __name##_dump); \ - const volatile u32 __name##_dump_len; \ - struct user_exit_info __name SEC(".data") - -#define UEI_RECORD(__uei_name, __ei) ({ \ - bpf_probe_read_kernel_str(__uei_name.reason, \ - sizeof(__uei_name.reason), (__ei)->reason); \ - bpf_probe_read_kernel_str(__uei_name.msg, \ - sizeof(__uei_name.msg), (__ei)->msg); \ - bpf_probe_read_kernel_str(__uei_name##_dump, \ - __uei_name##_dump_len, (__ei)->dump); \ - if (bpf_core_field_exists((__ei)->exit_code)) \ - __uei_name.exit_code = (__ei)->exit_code; \ - /* use __sync to force memory barrier */ \ - __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ - (__ei)->kind); \ -}) - -#else /* !__bpf__ */ - #include <stdio.h> #include <stdbool.h> +#include "user_exit_info_common.h" + /* no need to call the following explicitly if SCX_OPS_LOAD() is used */ #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ @@ -114,5 +70,4 @@ enum uei_ecode_mask { #define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -#endif /* __bpf__ */ #endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h new file mode 100644 index 000000000000..2d0981aedd89 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info_common.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef __USER_EXIT_INFO_COMMON_H +#define __USER_EXIT_INFO_COMMON_H + +#ifdef LSP +#include "../vmlinux.h" +#endif + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#endif /* __USER_EXIT_INFO_COMMON_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 50bc1737c167..55df8b798865 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * A central FIFO sched_ext scheduler which demonstrates the followings: + * A central FIFO sched_ext scheduler which demonstrates the following: * * a. Making all scheduling decisions from one CPU: * diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 6ba6e610eeaa..55931a4cd71c 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -61,6 +61,7 @@ restart: skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + assert(skel->rodata->nr_cpu_ids > 0); assert(skel->rodata->nr_cpu_ids <= INT32_MAX); while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index fdc7170639e6..2c720e3ecad5 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops, .cgroup_move = (void *)fcg_cgroup_move, .init = (void *)fcg_init, .exit = (void *)fcg_exit, - .flags = SCX_OPS_ENQ_EXITING, + .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, .name = "flatcg"); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 6dd423eeb4ff..cd85eb401179 
100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -6,6 +6,7 @@ */ #include <stdio.h> #include <signal.h> +#include <assert.h> #include <unistd.h> #include <libgen.h> #include <limits.h> @@ -137,6 +138,7 @@ restart: skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->nr_cpus > 0); skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 69d877501cb7..3072b593f898 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -39,7 +39,8 @@ const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; const volatile bool highpri_boosting; -const volatile bool print_shared_dsq; +const volatile bool print_dsqs_and_events; +const volatile bool print_msgs; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -56,7 +57,8 @@ struct qmap { queue1 SEC(".maps"), queue2 SEC(".maps"), queue3 SEC(".maps"), - queue4 SEC(".maps"); + queue4 SEC(".maps"), + dump_store SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); @@ -578,11 +580,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) return; scx_bpf_dump("QMAP FIFO[%d]:", i); + + /* + * Dump can be invoked anytime and there is no way to iterate in + * a non-destructive way. Pop and store in dump_store and then + * restore afterwards. If racing against new enqueues, ordering + * can get mixed up. 
+ */ bpf_repeat(4096) { if (bpf_map_pop_elem(fifo, &pid)) break; + bpf_map_push_elem(&dump_store, &pid, 0); scx_bpf_dump(" %d", pid); } + + bpf_repeat(4096) { + if (bpf_map_pop_elem(&dump_store, &pid)) + break; + bpf_map_push_elem(fifo, &pid, 0); + } + scx_bpf_dump("\n"); } } @@ -617,22 +634,25 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) { - bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", - cgrp->kn->id, args->weight, args->bw_period_us, - args->bw_quota_us, args->bw_burst_us); + if (print_msgs) + bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", + cgrp->kn->id, args->weight, args->bw_period_us, + args->bw_quota_us, args->bw_burst_us); return 0; } void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) { - bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); + if (print_msgs) + bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); } void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) { - bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id, - period_us, quota_us, burst_us); + if (print_msgs) + bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", + cgrp->kn->id, period_us, quota_us, burst_us); } /* @@ -676,16 +696,20 @@ static void print_cpus(void) void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) { - bpf_printk("CPU %d coming online", cpu); - /* @cpu is already online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); + } } void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) { - bpf_printk("CPU %d going offline", cpu); - /* @cpu is still online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ 
+ print_cpus(); + } } struct monitor_timer { @@ -783,35 +807,36 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { - struct scx_event_stats events; - bpf_rcu_read_lock(); dispatch_highpri(true); bpf_rcu_read_unlock(); monitor_cpuperf(); - if (print_shared_dsq) + if (print_dsqs_and_events) { + struct scx_event_stats events; + dump_shared_dsq(); - __COMPAT_scx_bpf_events(&events, sizeof(events)); - - bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", - scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", - scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", - scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); - bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", - scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); - bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", - scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", - scx_read_event(&events, SCX_EV_BYPASS_DURATION)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", - scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", - scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + __COMPAT_scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", + scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); + bpf_printk("%35s: %lld", 
"SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + } bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; @@ -823,7 +848,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) struct bpf_timer *timer; s32 ret; - print_cpus(); + if (print_msgs) + print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); if (ret) diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index c4912ab2e76f..ef701d45ba43 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-P] [-d PID] [-D LEN] [-p] [-v]\n" +" [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,7 +28,8 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" -" -P Print out DSQ content to trace_pipe every second, use with -b\n" +" -P Print out DSQ content and event counters to trace_pipe every second\n" +" -M Print out debug messages to trace_pipe\n" " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" @@ -66,7 +67,7 @@ int main(int argc, char **argv) skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) { switch (opt) { case 's': 
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -87,7 +88,10 @@ int main(int argc, char **argv) skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; case 'P': - skel->rodata->print_shared_dsq = true; + skel->rodata->print_dsqs_and_events = true; + break; + case 'M': + skel->rodata->print_msgs = true; break; case 'H': skel->rodata->highpri_boosting = true; diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 76d83199545c..06d4b13bf76b 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -7,6 +7,7 @@ #include <stdio.h> #include <unistd.h> #include <signal.h> +#include <assert.h> #include <libgen.h> #include <bpf/bpf.h> #include <scx/common.h> @@ -41,6 +42,7 @@ static void sigint_handler(int simple) static void read_stats(struct scx_simple *skel, __u64 *stats) { int nr_cpus = libbpf_num_possible_cpus(); + assert(nr_cpus > 0); __u64 cnts[2][nr_cpus]; __u32 idx; diff --git a/tools/scripts/syscall.tbl b/tools/scripts/syscall.tbl index 580b4e246aec..d1ae5e92c615 100644 --- a/tools/scripts/syscall.tbl +++ b/tools/scripts/syscall.tbl @@ -408,3 +408,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 002ec38a8bbb..3b96d090c5eb 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -17,6 +17,8 @@ #include <asm/sigcontext.h> #include <asm/unistd.h> +#include <linux/auxvec.h> + #include "../../kselftest.h" #define TESTS_PER_HWCAP 3 @@ -55,7 +57,6 @@ static void cmpbr_sigill(void) /* Not implemented, too complicated and unreliable anyway */ } - static void crc32_sigill(void) { /* CRC32W W0, W0, W1 */ @@ -169,6 +170,18 @@ static void lse128_sigill(void) : "cc", "memory"); } +static void 
lsfe_sigill(void) +{ + float __attribute__ ((aligned (16))) mem; + register float *memp asm ("x0") = &mem; + + /* STFADD H0, [X0] */ + asm volatile(".inst 0x7c20801f" + : "+r" (memp) + : + : "memory"); +} + static void lut_sigill(void) { /* LUTI2 V0.16B, { V0.16B }, V[0] */ @@ -763,6 +776,13 @@ static const struct hwcap_data { .sigill_fn = lse128_sigill, }, { + .name = "LSFE", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_LSFE, + .cpuinfo = "lsfe", + .sigill_fn = lsfe_sigill, + }, + { .name = "LUT", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_LUT, diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 3b520b7efa49..1703543fb7c7 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -227,10 +227,10 @@ int main(int argc, char **argv) ret = open("/proc/sys/abi/sme_default_vector_length", O_RDONLY, 0); if (ret >= 0) { ksft_test_result(default_value(), "default_value\n"); - ksft_test_result(write_read, "write_read\n"); - ksft_test_result(write_sleep_read, "write_sleep_read\n"); - ksft_test_result(write_fork_read, "write_fork_read\n"); - ksft_test_result(write_clone_read, "write_clone_read\n"); + ksft_test_result(write_read(), "write_read\n"); + ksft_test_result(write_sleep_read(), "write_sleep_read\n"); + ksft_test_result(write_fork_read(), "write_fork_read\n"); + ksft_test_result(write_clone_read(), "write_clone_read\n"); } else { ksft_print_msg("SME support not present\n"); diff --git a/tools/testing/selftests/arm64/bti/assembler.h b/tools/testing/selftests/arm64/bti/assembler.h index 04e7b72880ef..141cdcbf0b8f 100644 --- a/tools/testing/selftests/arm64/bti/assembler.h +++ b/tools/testing/selftests/arm64/bti/assembler.h @@ -14,7 +14,6 @@ #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) - .macro startfn name:req .globl \name \name: diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c 
b/tools/testing/selftests/arm64/fp/fp-ptrace.c index 124bc883365e..a85c19e9524e 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1187,7 +1187,7 @@ static void sve_write_sve(pid_t child, struct test_config *config) if (!vl) return; - iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); + iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_SVE); iov.iov_base = malloc(iov.iov_len); if (!iov.iov_base) { ksft_print_msg("Failed allocating %lu byte SVE write buffer\n", @@ -1234,8 +1234,7 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config) if (!vl) return; - iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, - SVE_PT_REGS_FPSIMD); + iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD); iov.iov_base = malloc(iov.iov_len); if (!iov.iov_base) { ksft_print_msg("Failed allocating %lu byte SVE write buffer\n", @@ -1569,7 +1568,6 @@ static void run_sve_tests(void) &test_config); } } - } static void run_sme_tests(void) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 74e23208b94c..9349aa630c84 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -105,8 +105,8 @@ static void child_start(struct child_data *child, const char *program) /* * Read from the startup pipe, there should be no data - * and we should block until it is closed. We just - * carry on on error since this isn't super critical. + * and we should block until it is closed. We just + * carry-on on error since this isn't super critical. 
*/ ret = read(3, &i, sizeof(i)); if (ret < 0) @@ -549,7 +549,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < cpus; i++) { diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index e3cec3723ffa..0c40007d1282 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -188,13 +188,13 @@ static bool create_socket(void) ref = malloc(digest_len); if (!ref) { - printf("Failed to allocated %d byte reference\n", digest_len); + printf("Failed to allocate %d byte reference\n", digest_len); return false; } digest = malloc(digest_len); if (!digest) { - printf("Failed to allocated %d byte digest\n", digest_len); + printf("Failed to allocate %d byte digest\n", digest_len); return false; } diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index b22303778fb0..e0fc3a001e28 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -66,7 +66,7 @@ static const struct vec_type vec_types[] = { }; #define VL_TESTS (((TEST_VQ_MAX - SVE_VQ_MIN) + 1) * 4) -#define FLAG_TESTS 2 +#define FLAG_TESTS 4 #define FPSIMD_TESTS 2 #define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types)) @@ -95,19 +95,27 @@ static int do_child(void) static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_GETREGSET)"); + return ret; } static int set_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; 
iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; } static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, @@ -115,8 +123,9 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, { struct user_sve_header *sve; void *p; - size_t sz = sizeof *sve; + size_t sz = sizeof(*sve); struct iovec iov; + int ret; while (1) { if (*size < sz) { @@ -132,8 +141,11 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, iov.iov_base = *buf; iov.iov_len = sz; - if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov)) + ret = ptrace(PTRACE_GETREGSET, pid, type->regset, &iov); + if (ret) { + ksft_perror("ptrace(PTRACE_GETREGSET)"); goto error; + } sve = *buf; if (sve->size <= sz) @@ -152,10 +164,46 @@ static int set_sve(pid_t pid, const struct vec_type *type, const struct user_sve_header *sve) { struct iovec iov; + int ret; iov.iov_base = (void *)sve; iov.iov_len = sve->size; - return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; +} + +/* A read operation fails */ +static void read_fails(pid_t child, const struct vec_type *type) +{ + struct user_sve_header *new_sve = NULL; + size_t new_sve_size = 0; + void *ret; + + ret = get_sve(child, type, (void **)&new_sve, &new_sve_size); + + ksft_test_result(ret == NULL, "%s unsupported read fails\n", + type->name); + + free(new_sve); +} + +/* A write operation fails */ +static void write_fails(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + /* Just the header, no data */ + memset(&sve, 0, sizeof(sve)); + sve.size = sizeof(sve); + sve.flags = SVE_PT_REGS_SVE; + sve.vl = SVE_VL_MIN; + ret = set_sve(child, 
type, &sve); + + ksft_test_result(ret != 0, "%s unsupported write fails\n", + type->name); } /* Validate setting and getting the inherit flag */ @@ -270,6 +318,25 @@ static void check_u32(unsigned int vl, const char *reg, } } +/* Set out of range VLs */ +static void ptrace_set_vl_ranges(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + memset(&sve, 0, sizeof(sve)); + sve.flags = SVE_PT_REGS_SVE; + sve.size = sizeof(sve); + + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL 0\n", type->name); + + sve.vl = SVE_VL_MAX + SVE_VQ_BYTES; + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL %d\n", type->name, + SVE_VL_MAX + SVE_VQ_BYTES); +} + /* Access the FPSIMD registers via the SVE regset */ static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type) { @@ -683,6 +750,20 @@ static int do_parent(pid_t child) } for (i = 0; i < ARRAY_SIZE(vec_types); i++) { + /* + * If the vector type isn't supported reads and writes + * should fail. 
+ */ + if (!(getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap)) { + read_fails(child, &vec_types[i]); + write_fails(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s unsupported read fails\n", + vec_types[i].name); + ksft_test_result_skip("%s unsupported write fails\n", + vec_types[i].name); + } + /* FPSIMD via SVE regset */ if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { ptrace_sve_fpsimd(child, &vec_types[i]); @@ -703,6 +784,17 @@ static int do_parent(pid_t child) vec_types[i].name); } + /* Setting out of bounds VLs should fail */ + if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { + ptrace_set_vl_ranges(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s Set invalid VL 0\n", + vec_types[i].name); + ksft_test_result_skip("%s Set invalid VL %d\n", + vec_types[i].name, + SVE_VL_MAX + SVE_VQ_BYTES); + } + /* Step through every possible VQ */ for (vq = SVE_VQ_MIN; vq <= TEST_VQ_MAX; vq++) { vl = sve_vl_from_vq(vq); diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c index ea9c7d47790f..2d75d342eeb9 100644 --- a/tools/testing/selftests/arm64/fp/vec-syscfg.c +++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c @@ -690,7 +690,6 @@ static inline void smstop(void) asm volatile("msr S0_3_C4_C6_3, xzr"); } - /* * Verify we can change the SVE vector length while SME is active and * continue to use SME afterwards. 
diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c index 584b8d59b7ea..a7f34040fbf1 100644 --- a/tools/testing/selftests/arm64/fp/zt-ptrace.c +++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c @@ -108,7 +108,6 @@ static int get_zt(pid_t pid, char zt[ZT_SIG_REG_BYTES]) return ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZT, &iov); } - static int set_zt(pid_t pid, const char zt[ZT_SIG_REG_BYTES]) { struct iovec iov; diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index d2f3497a9103..1fbbf0ca1f02 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -14,11 +14,11 @@ LDLIBS+=-lpthread include ../../lib.mk $(OUTPUT)/basic-gcs: basic-gcs.c - $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ - -static -include ../../../../include/nolibc/nolibc.h \ + $(CC) $(CFLAGS) -fno-asynchronous-unwind-tables -fno-ident -s -nostdlib -nostdinc \ + -static -I../../../../include/nolibc -include ../../../../include/nolibc/nolibc.h \ -I../../../../../usr/include \ -std=gnu99 -I../.. -g \ - -ffreestanding -Wall $^ -o $@ -lgcc + -ffreestanding $^ -o $@ -lgcc $(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S $(CC) -nostdlib $^ -o $@ diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 54f9c888249d..250977abc398 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -10,6 +10,7 @@ #include <sys/mman.h> #include <asm/mman.h> +#include <asm/hwcap.h> #include <linux/sched.h> #include "kselftest.h" @@ -386,14 +387,13 @@ int main(void) ksft_print_header(); - /* - * We don't have getauxval() with nolibc so treat a failure to - * read GCS state as a lack of support and skip. 
- */ + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_skip("Failed to read GCS state: %d\n", ret); + ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret); if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { gcs_mode = PR_SHADOW_STACK_ENABLE; @@ -410,7 +410,7 @@ int main(void) } /* One last test: disable GCS, we can do this one time */ - my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); if (ret != 0) ksft_print_msg("Failed to disable GCS: %d\n", ret); diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c index 989f75a491b7..1e6abb136ffd 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-locking.c +++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c @@ -165,7 +165,6 @@ TEST_F(valid_modes, lock_enable_disable_others) ASSERT_EQ(ret, 0); ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES); - ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, variant->mode); ASSERT_EQ(ret, 0); diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index bbc7f4950c13..cf316d78ea97 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -433,7 +433,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < gcs_threads; i++) diff --git a/tools/testing/selftests/arm64/pauth/exec_target.c b/tools/testing/selftests/arm64/pauth/exec_target.c index 4435600ca400..e597861b26d6 100644 --- a/tools/testing/selftests/arm64/pauth/exec_target.c +++ b/tools/testing/selftests/arm64/pauth/exec_target.c @@ -13,7 +13,12 @@ int main(void) unsigned 
long hwcaps; size_t val; - fread(&val, sizeof(size_t), 1, stdin); + size_t size = fread(&val, sizeof(size_t), 1, stdin); + + if (size != 1) { + fprintf(stderr, "Could not read input from stdin\n"); + return EXIT_FAILURE; + } /* don't try to execute illegal (unimplemented) instructions) caller * should have checked this and keep worker simple diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3d8378972d26..be1ee7ba7ce0 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -44,3 +44,4 @@ xdp_redirect_multi xdp_synproxy xdp_hw_metadata xdp_features +verification_cert.h diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 3ebd77206f98..a17baf8c6fd7 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -2,4 +2,3 @@ # Alphabetical order get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) 
-verifier_iterating_callbacks diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4863106034df..f00587d4ede6 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -120,7 +120,7 @@ TEST_PROGS_EXTENDED := \ test_bpftool.py TEST_KMODS := bpf_testmod.ko bpf_test_no_cfi.ko bpf_test_modorder_x.ko \ - bpf_test_modorder_y.ko + bpf_test_modorder_y.ko bpf_test_rqspinlock.ko TEST_KMOD_TARGETS = $(addprefix $(OUTPUT)/,$(TEST_KMODS)) # Compile but not part of 'make run_tests' @@ -137,7 +137,7 @@ TEST_GEN_PROGS_EXTENDED = \ xdping \ xskxceiver -TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi +TEST_GEN_FILES += $(TEST_KMODS) liburandom_read.so urandom_read sign-file uprobe_multi ifneq ($(V),1) submake_extras := feature_display=0 @@ -398,7 +398,7 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif -# vmlinux.h is first dumped to a temprorary file and then compared to +# vmlinux.h is first dumped to a temporary file and then compared to # the previous version. 
This helps to avoid unnecessary re-builds of # $(TRUNNER_BPF_OBJS) $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) @@ -496,15 +496,16 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ test_usdt.skel.h -LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ - trace_printk.c trace_vprintk.c map_ptr_kern.c \ +LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c +LSKELS_SIGNED := fentry_test.c fexit_test.c atomics.c + # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ kfunc_call_test_subprog.c -SKEL_BLACKLIST += $$(LSKELS) +SKEL_BLACKLIST += $$(LSKELS) $$(LSKELS_SIGNED) test_static_linked.skel.h-deps := test_static_linked1.bpf.o test_static_linked2.bpf.o linked_funcs.skel.h-deps := linked_funcs1.bpf.o linked_funcs2.bpf.o @@ -535,6 +536,7 @@ HEADERS_FOR_BPF_OBJS := $(wildcard $(BPFDIR)/*.bpf.h) \ # $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER +LSKEL_SIGN := -S -k $(PRIVATE_KEY) -i $(VERIFICATION_CERT) TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2 TRUNNER_BINARY := $1$(if $2,-)$2 TRUNNER_TEST_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.test.o, \ @@ -550,6 +552,7 @@ TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(TRUNNER_BPF_SRCS))) TRUNNER_BPF_LSKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.lskel.h, $$(LSKELS) $$(LSKELS_EXTRA)) TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) +TRUNNER_BPF_LSKELS_SIGNED := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.lskel.h, $$(LSKELS_SIGNED)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined @@ -604,6 +607,15 @@ $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | 
$(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) +$(TRUNNER_BPF_LSKELS_SIGNED): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) + $$(call msg,GEN-SKEL,$(TRUNNER_BINARY) (signed),$$@) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< + $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) + $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(Q)$$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ + $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/% # .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites @@ -653,6 +665,7 @@ $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_SKELS) \ $(TRUNNER_BPF_LSKELS) \ + $(TRUNNER_BPF_LSKELS_SIGNED) \ $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) @@ -667,6 +680,7 @@ $(foreach N,$(patsubst $(TRUNNER_OUTPUT)/%.o,%,$(TRUNNER_EXTRA_OBJS)), \ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ $(TRUNNER_EXTRA_HDRS) \ + $(VERIFY_SIG_HDR) \ $(TRUNNER_TESTS_HDR) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) @@ -697,6 +711,18 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ endef +VERIFY_SIG_SETUP := $(CURDIR)/verify_sig_setup.sh +VERIFY_SIG_HDR := verification_cert.h +VERIFICATION_CERT := $(BUILD_DIR)/signing_key.der +PRIVATE_KEY := $(BUILD_DIR)/signing_key.pem + +$(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) + $(Q)mkdir -p $(BUILD_DIR) + $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) + +$(VERIFY_SIG_HDR): $(VERIFICATION_CERT) + $(Q)xxd -i -n test_progs_verification_cert $< > $@ + # Define test_progs test runner. 
TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs @@ -716,6 +742,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ disasm.c \ disasm_helpers.c \ json_writer.c \ + $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ ip_check_defrag_frags.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ @@ -725,7 +752,7 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/uprobe_multi \ $(TEST_KMOD_TARGETS) \ ima_setup.sh \ - verify_sig_setup.sh \ + $(VERIFY_SIG_SETUP) \ $(wildcard progs/btf_dump_test_case_*.c) \ $(wildcard progs/*.bpf.o) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE @@ -816,6 +843,7 @@ $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h +$(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_trie_map.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -837,6 +865,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_htab_mem.o \ $(OUTPUT)/bench_bpf_crypto.o \ $(OUTPUT)/bench_sockmap.o \ + $(OUTPUT)/bench_lpm_trie_map.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index ddd73d06a1eb..bd29bb2e6cb5 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -284,6 +284,7 @@ extern struct argp bench_htab_mem_argp; extern struct argp bench_trigger_batch_argp; extern struct argp bench_crypto_argp; extern struct argp bench_sockmap_argp; +extern struct argp bench_lpm_trie_map_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -299,6 +300,7 @@ static const struct argp_child bench_parsers[] = { { &bench_trigger_batch_argp, 0, 
"BPF triggering benchmark", 0 }, { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, { &bench_sockmap_argp, 0, "bpf sockmap benchmark", 0 }, + { &bench_lpm_trie_map_argp, 0, "LPM trie map benchmark", 0 }, {}, }; @@ -499,7 +501,7 @@ extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; -/* pure counting benchmarks to establish theoretical lmits */ +/* pure counting benchmarks to establish theoretical limits */ extern const struct bench bench_trig_usermode_count; extern const struct bench bench_trig_syscall_count; extern const struct bench bench_trig_kernel_count; @@ -510,6 +512,8 @@ extern const struct bench bench_trig_kretprobe; extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; +extern const struct bench bench_trig_kprobe_multi_all; +extern const struct bench bench_trig_kretprobe_multi_all; extern const struct bench bench_trig_fexit; extern const struct bench bench_trig_fmodret; extern const struct bench bench_trig_tp; @@ -558,6 +562,13 @@ extern const struct bench bench_htab_mem; extern const struct bench bench_crypto_encrypt; extern const struct bench bench_crypto_decrypt; extern const struct bench bench_sockmap; +extern const struct bench bench_lpm_trie_noop; +extern const struct bench bench_lpm_trie_baseline; +extern const struct bench bench_lpm_trie_lookup; +extern const struct bench bench_lpm_trie_insert; +extern const struct bench bench_lpm_trie_update; +extern const struct bench bench_lpm_trie_delete; +extern const struct bench bench_lpm_trie_free; static const struct bench *benchs[] = { &bench_count_global, @@ -578,6 +589,8 @@ static const struct bench *benchs[] = { &bench_trig_kprobe_multi, &bench_trig_kretprobe_multi, &bench_trig_fentry, + &bench_trig_kprobe_multi_all, + &bench_trig_kretprobe_multi_all, &bench_trig_fexit, &bench_trig_fmodret, &bench_trig_tp, @@ -625,6 
+638,13 @@ static const struct bench *benchs[] = { &bench_crypto_encrypt, &bench_crypto_decrypt, &bench_sockmap, + &bench_lpm_trie_noop, + &bench_lpm_trie_baseline, + &bench_lpm_trie_lookup, + &bench_lpm_trie_insert, + &bench_lpm_trie_update, + &bench_lpm_trie_delete, + &bench_lpm_trie_free, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 005c401b3e22..bea323820ffb 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -46,6 +46,7 @@ struct bench_res { unsigned long gp_ns; unsigned long gp_ct; unsigned int stime; + unsigned long duration_ns; }; struct bench { diff --git a/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c new file mode 100644 index 000000000000..246f6cb3387d --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Cloudflare */ + +/* + * All of these benchmarks operate on tries with keys in the range + * [0, args.nr_entries), i.e. there are no gaps or partially filled + * branches of the trie for any key < args.nr_entries. + * + * This gives an idea of worst-case behaviour. 
+ */ + +#include <argp.h> +#include <linux/time64.h> +#include <linux/if_ether.h> +#include "lpm_trie_bench.skel.h" +#include "lpm_trie_map.skel.h" +#include "bench.h" +#include "testing_helpers.h" +#include "progs/lpm_trie.h" + +static struct ctx { + struct lpm_trie_bench *bench; +} ctx; + +static struct { + __u32 nr_entries; + __u32 prefixlen; + bool random; +} args = { + .nr_entries = 0, + .prefixlen = 32, + .random = false, +}; + +enum { + ARG_NR_ENTRIES = 9000, + ARG_PREFIX_LEN, + ARG_RANDOM, +}; + +static const struct argp_option opts[] = { + { "nr_entries", ARG_NR_ENTRIES, "NR_ENTRIES", 0, + "Number of unique entries in the LPM trie" }, + { "prefix_len", ARG_PREFIX_LEN, "PREFIX_LEN", 0, + "Number of prefix bits to use in the LPM trie" }, + { "random", ARG_RANDOM, NULL, 0, "Access random keys during op" }, + {}, +}; + +static error_t lpm_parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_NR_ENTRIES: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "Invalid nr_entries count."); + argp_usage(state); + } + args.nr_entries = ret; + break; + case ARG_PREFIX_LEN: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "Invalid prefix_len value."); + argp_usage(state); + } + args.prefixlen = ret; + break; + case ARG_RANDOM: + args.random = true; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +const struct argp bench_lpm_trie_map_argp = { + .options = opts, + .parser = lpm_parse_arg, +}; + +static void validate_common(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "benchmark doesn't support consumer\n"); + exit(1); + } + + if (args.nr_entries == 0) { + fprintf(stderr, "Missing --nr_entries parameter\n"); + exit(1); + } + + if ((1UL << args.prefixlen) < args.nr_entries) { + fprintf(stderr, "prefix_len value too small for nr_entries\n"); + exit(1); + } +} + +static void lpm_insert_validate(void) +{ + validate_common(); + + if 
(env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-insert requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-insert does not support --random\n"); + exit(1); + } +} + +static void lpm_delete_validate(void) +{ + validate_common(); + + if (env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-delete requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-delete does not support --random\n"); + exit(1); + } +} + +static void lpm_free_validate(void) +{ + validate_common(); + + if (env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-free requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-free does not support --random\n"); + exit(1); + } +} + +static struct trie_key *keys; +static __u32 *vals; + +static void fill_map(int map_fd) +{ + int err; + + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + err = bpf_map_update_batch(map_fd, keys, vals, &args.nr_entries, &opts); + if (err) { + fprintf(stderr, "failed to batch update keys to map: %d\n", + -err); + exit(1); + } +} + +static void empty_map(int map_fd) +{ + int err; + + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + err = bpf_map_delete_batch(map_fd, keys, &args.nr_entries, &opts); + if (err) { + fprintf(stderr, "failed to batch delete keys for map: %d\n", + -err); + exit(1); + } +} + +static void attach_prog(void) +{ + int i; + + ctx.bench = lpm_trie_bench__open_and_load(); + if (!ctx.bench) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + ctx.bench->bss->nr_entries = args.nr_entries; + ctx.bench->bss->prefixlen = args.prefixlen; + ctx.bench->bss->random = args.random; + + if (lpm_trie_bench__attach(ctx.bench)) { + fprintf(stderr, "failed to attach skeleton\n"); + exit(1); + } + + keys = calloc(args.nr_entries, sizeof(*keys)); + vals = calloc(args.nr_entries, sizeof(*vals)); + + for 
(i = 0; i < args.nr_entries; i++) { + struct trie_key *k = &keys[i]; + __u32 *v = &vals[i]; + + k->prefixlen = args.prefixlen; + k->data = i; + *v = 1; + } +} + +static void attach_prog_and_fill_map(void) +{ + int fd; + + attach_prog(); + + fd = bpf_map__fd(ctx.bench->maps.trie_map); + fill_map(fd); +} + +static void lpm_noop_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_NOOP; +} + +static void lpm_baseline_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_BASELINE; +} + +static void lpm_lookup_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_LOOKUP; +} + +static void lpm_insert_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_INSERT; +} + +static void lpm_update_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_UPDATE; +} + +static void lpm_delete_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_DELETE; +} + +static void lpm_free_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_FREE; +} + +static void lpm_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.bench->bss->hits, 0); + res->duration_ns = atomic_swap(&ctx.bench->bss->duration_ns, 0); +} + +static void bench_reinit_map(void) +{ + int fd = bpf_map__fd(ctx.bench->maps.trie_map); + + switch (ctx.bench->bss->op) { + case LPM_OP_INSERT: + /* trie_map needs to be emptied */ + empty_map(fd); + break; + case LPM_OP_DELETE: + /* trie_map needs to be refilled */ + fill_map(fd); + break; + default: + fprintf(stderr, "Unexpected REINIT return code for op %d\n", + ctx.bench->bss->op); + exit(1); + } +} + +/* For NOOP, BASELINE, LOOKUP, INSERT, UPDATE, and DELETE */ +static void *lpm_producer(void *unused __always_unused) +{ + int err; + char in[ETH_HLEN]; /* unused */ + + LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = in, + .data_size_in = sizeof(in), .repeat = 1, ); + + while (true) { + int fd = bpf_program__fd(ctx.bench->progs.run_bench); + err = bpf_prog_test_run_opts(fd, 
&opts); + if (err) { + fprintf(stderr, "failed to run BPF prog: %d\n", err); + exit(1); + } + + /* Check for kernel error code */ + if ((int)opts.retval < 0) { + fprintf(stderr, "BPF prog returned error: %d\n", + opts.retval); + exit(1); + } + + switch (opts.retval) { + case LPM_BENCH_SUCCESS: + break; + case LPM_BENCH_REINIT_MAP: + bench_reinit_map(); + break; + default: + fprintf(stderr, "Unexpected BPF prog return code %d for op %d\n", + opts.retval, ctx.bench->bss->op); + exit(1); + } + } + + return NULL; +} + +static void *lpm_free_producer(void *unused __always_unused) +{ + while (true) { + struct lpm_trie_map *skel; + + skel = lpm_trie_map__open_and_load(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + fill_map(bpf_map__fd(skel->maps.trie_free_map)); + lpm_trie_map__destroy(skel); + } + + return NULL; +} + +/* + * The standard bench op_report_*() functions assume measurements are + * taken over a 1-second interval but operations that modify the map + * (INSERT, DELETE, and FREE) cannot run indefinitely without + * "resetting" the map to the initial state. Depending on the size of + * the map, this likely needs to happen before the 1-second timer fires. + * + * Calculate the fraction of a second over which the op measurement was + * taken (to ignore any time spent doing the reset) and report the + * throughput results per second. 
+ */ +static void frac_second_report_progress(int iter, struct bench_res *res, + long delta_ns, double rate_divisor, + char rate) +{ + double hits_per_sec, hits_per_prod; + + hits_per_sec = res->hits / rate_divisor / + (res->duration_ns / (double)NSEC_PER_SEC); + hits_per_prod = hits_per_sec / env.producer_cnt; + + printf("Iter %3d (%7.3lfus): ", iter, + (delta_ns - NSEC_PER_SEC) / 1000.0); + printf("hits %8.3lf%c/s (%7.3lf%c/prod)\n", hits_per_sec, rate, + hits_per_prod, rate); +} + +static void frac_second_report_final(struct bench_res res[], int res_cnt, + double lat_divisor, double rate_divisor, + char rate, const char *unit) +{ + double hits_mean = 0.0, hits_stddev = 0.0; + double latency = 0.0; + int i; + + for (i = 0; i < res_cnt; i++) { + double val = res[i].hits / rate_divisor / + (res[i].duration_ns / (double)NSEC_PER_SEC); + hits_mean += val / (0.0 + res_cnt); + latency += res[i].duration_ns / res[i].hits / (0.0 + res_cnt); + } + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) { + double val = + res[i].hits / rate_divisor / + (res[i].duration_ns / (double)NSEC_PER_SEC); + hits_stddev += (hits_mean - val) * (hits_mean - val) / + (res_cnt - 1.0); + } + + hits_stddev = sqrt(hits_stddev); + } + printf("Summary: throughput %8.3lf \u00B1 %5.3lf %c ops/s (%7.3lf%c ops/prod), ", + hits_mean, hits_stddev, rate, hits_mean / env.producer_cnt, + rate); + printf("latency %8.3lf %s/op\n", + latency / lat_divisor / env.producer_cnt, unit); +} + +static void insert_ops_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double rate_divisor = 1000000.0; + char rate = 'M'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void delete_ops_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double rate_divisor = 1000000.0; + char rate = 'M'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void free_ops_report_progress(int iter, struct bench_res *res, + long 
delta_ns) +{ + double rate_divisor = 1000.0; + char rate = 'K'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void insert_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1.0; + double rate_divisor = 1000000.0; + const char *unit = "ns"; + char rate = 'M'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +static void delete_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1.0; + double rate_divisor = 1000000.0; + const char *unit = "ns"; + char rate = 'M'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +static void free_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1000000.0; + double rate_divisor = 1000.0; + const char *unit = "ms"; + char rate = 'K'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +/* noop bench measures harness-overhead */ +const struct bench bench_lpm_trie_noop = { + .name = "lpm-trie-noop", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_noop_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* baseline overhead for lookup and update */ +const struct bench bench_lpm_trie_baseline = { + .name = "lpm-trie-baseline", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_baseline_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of doing a lookup on existing entries in a full trie */ +const struct bench bench_lpm_trie_lookup = { + .name = "lpm-trie-lookup", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_lookup_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, 
+ .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of inserting new entries into an empty trie */ +const struct bench bench_lpm_trie_insert = { + .name = "lpm-trie-insert", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_insert_validate, + .setup = lpm_insert_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = insert_ops_report_progress, + .report_final = insert_ops_report_final, +}; + +/* measure cost of updating existing entries in a full trie */ +const struct bench bench_lpm_trie_update = { + .name = "lpm-trie-update", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_update_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of deleting existing entries from a full trie */ +const struct bench bench_lpm_trie_delete = { + .name = "lpm-trie-delete", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_delete_validate, + .setup = lpm_delete_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = delete_ops_report_progress, + .report_final = delete_ops_report_final, +}; + +/* measure cost of freeing a full trie */ +const struct bench bench_lpm_trie_free = { + .name = "lpm-trie-free", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_free_validate, + .setup = lpm_free_setup, + .producer_thread = lpm_free_producer, + .measure = lpm_measure, + .report_progress = free_ops_report_progress, + .report_final = free_ops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_sockmap.c b/tools/testing/selftests/bpf/benchs/bench_sockmap.c index 8ebf563a67a2..cfc072aa7fff 100644 --- a/tools/testing/selftests/bpf/benchs/bench_sockmap.c +++ b/tools/testing/selftests/bpf/benchs/bench_sockmap.c @@ -10,6 +10,7 @@ #include <argp.h> #include "bench.h" #include "bench_sockmap_prog.skel.h" 
+#include "bpf_util.h" #define FILE_SIZE (128 * 1024) #define DATA_REPEAT_SIZE 10 @@ -124,8 +125,8 @@ static void bench_sockmap_prog_destroy(void) { int i; - for (i = 0; i < sizeof(ctx.fds); i++) { - if (ctx.fds[0] > 0) + for (i = 0; i < ARRAY_SIZE(ctx.fds); i++) { + if (ctx.fds[i] > 0) close(ctx.fds[i]); } diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 82327657846e..1e2aff007c2a 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -226,6 +226,65 @@ static void trigger_fentry_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fentry); } +static void attach_ksyms_all(struct bpf_program *empty, bool kretprobe) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + char **syms = NULL; + size_t cnt = 0; + + /* Some recursive functions will be skipped in + * bpf_get_ksyms -> skip_entry, as they can introduce sufficient + * overhead. However, it's difficult to skip all the recursive + * functions for a debug kernel. + * + * So, don't run the kprobe-multi-all and kretprobe-multi-all on + * a debug kernel. + */ + if (bpf_get_ksyms(&syms, &cnt, true)) { + fprintf(stderr, "failed to get ksyms\n"); + exit(1); + } + + opts.syms = (const char **) syms; + opts.cnt = cnt; + opts.retprobe = kretprobe; + /* attach empty to all the kernel functions except bpf_get_numa_node_id. 
*/ + if (!bpf_program__attach_kprobe_multi_opts(empty, NULL, &opts)) { + fprintf(stderr, "failed to attach bpf_program__attach_kprobe_multi_opts to all\n"); + exit(1); + } +} + +static void trigger_kprobe_multi_all_setup(void) +{ + struct bpf_program *prog, *empty; + + setup_ctx(); + empty = ctx.skel->progs.bench_kprobe_multi_empty; + prog = ctx.skel->progs.bench_trigger_kprobe_multi; + bpf_program__set_autoload(empty, true); + bpf_program__set_autoload(prog, true); + load_ctx(); + + attach_ksyms_all(empty, false); + attach_bpf(prog); +} + +static void trigger_kretprobe_multi_all_setup(void) +{ + struct bpf_program *prog, *empty; + + setup_ctx(); + empty = ctx.skel->progs.bench_kretprobe_multi_empty; + prog = ctx.skel->progs.bench_trigger_kretprobe_multi; + bpf_program__set_autoload(empty, true); + bpf_program__set_autoload(prog, true); + load_ctx(); + + attach_ksyms_all(empty, true); + attach_bpf(prog); +} + static void trigger_fexit_setup(void) { setup_ctx(); @@ -512,6 +571,8 @@ BENCH_TRIG_KERNEL(kretprobe, "kretprobe"); BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); BENCH_TRIG_KERNEL(fentry, "fentry"); +BENCH_TRIG_KERNEL(kprobe_multi_all, "kprobe-multi-all"); +BENCH_TRIG_KERNEL(kretprobe_multi_all, "kretprobe-multi-all"); BENCH_TRIG_KERNEL(fexit, "fexit"); BENCH_TRIG_KERNEL(fmodret, "fmodret"); BENCH_TRIG_KERNEL(tp, "tp"); diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index a690f5a68b6b..f7573708a0c3 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -6,8 +6,8 @@ def_tests=( \ usermode-count kernel-count syscall-count \ fentry fexit fmodret \ rawtp tp \ - kprobe kprobe-multi \ - kretprobe kretprobe-multi \ + kprobe kprobe-multi kprobe-multi-all \ + kretprobe kretprobe-multi kretprobe-multi-all \ ) tests=("$@") diff --git 
a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index da7e230f2781..d89eda3fd8a3 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -599,4 +599,58 @@ extern void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) __weak __ksym; extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, struct bpf_dynptr *value_p) __weak __ksym; +#define PREEMPT_BITS 8 +#define SOFTIRQ_BITS 8 +#define HARDIRQ_BITS 4 +#define NMI_BITS 4 + +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + +#define __IRQ_MASK(x) ((1UL << (x))-1) + +#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) +#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) + +extern bool CONFIG_PREEMPT_RT __kconfig __weak; +#ifdef bpf_target_x86 +extern const int __preempt_count __ksym; +#endif + +struct task_struct___preempt_rt { + int softirq_disable_cnt; +} __attribute__((preserve_access_index)); + +static inline int get_preempt_count(void) +{ +#if defined(bpf_target_x86) + return *(int *) bpf_this_cpu_ptr(&__preempt_count); +#elif defined(bpf_target_arm64) + return bpf_get_current_task_btf()->thread_info.preempt.count; +#endif + return 0; +} + +/* Description + * Report whether it is in interrupt context. 
Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_interrupt(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return pcnt & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK); + + tsk = (void *) bpf_get_current_task_btf(); + return (pcnt & (NMI_MASK | HARDIRQ_MASK)) | + (tsk->softirq_disable_cnt & SOFTIRQ_MASK); +} + #endif diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 9386dfe8b884..794d44d19c88 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -19,6 +19,9 @@ extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, extern int bpf_dynptr_from_xdp(struct xdp_md *xdp, __u64 flags, struct bpf_dynptr *ptr__uninit) __ksym __weak; +extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags, + struct bpf_dynptr *ptr__uninit) __ksym __weak; + /* Description * Obtain a read-only pointer to the dynptr's data * Returns diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 5f6963a320d7..4bc2d25f33e1 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -67,6 +67,9 @@ static inline void bpf_strlcpy(char *dst, const char *src, size_t sz) #define sys_gettid() syscall(SYS_gettid) #endif +/* and poison usage to ensure it does not creep back in. */ +#pragma GCC poison gettid + #ifndef ENOTSUPP #define ENOTSUPP 524 #endif diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 15f626014872..20cede4db3ce 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -412,6 +412,26 @@ void remove_cgroup(const char *relative_path) log_err("rmdiring cgroup %s .. 
%s", relative_path, cgroup_path); } +/* + * remove_cgroup_pid() - Remove a cgroup setup by process identified by PID + * @relative_path: The cgroup path, relative to the workdir, to remove + * @pid: PID to be used to find cgroup_path + * + * This function expects a cgroup to already be created, relative to the cgroup + * work dir. It also expects the cgroup doesn't have any children or live + * processes and it removes the cgroup. + * + * On failure, it will print an error to stderr. + */ +void remove_cgroup_pid(const char *relative_path, int pid) +{ + char cgroup_path[PATH_MAX + 1]; + + format_cgroup_path_pid(cgroup_path, relative_path, pid); + if (rmdir(cgroup_path)) + log_err("rmdiring cgroup %s .. %s", relative_path, cgroup_path); +} + /** * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD * @relative_path: The cgroup path, relative to the workdir, to join diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h index 182e1ac36c95..3857304be874 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -19,6 +19,7 @@ int cgroup_setup_and_join(const char *relative_path); int get_root_cgroup(void); int create_and_get_cgroup(const char *relative_path); void remove_cgroup(const char *relative_path); +void remove_cgroup_pid(const char *relative_path, int pid); unsigned long long get_cgroup_id(const char *relative_path); int get_cgroup1_hierarchy_id(const char *subsys_name); diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 8916ab814a3e..70b28c1e653e 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -61,6 +61,7 @@ CONFIG_MPLS_IPTUNNEL=y CONFIG_MPLS_ROUTING=y CONFIG_MPTCP=y CONFIG_NET_ACT_GACT=y +CONFIG_NET_ACT_MIRRED=y CONFIG_NET_ACT_SKBMOD=y CONFIG_NET_CLS=y CONFIG_NET_CLS_ACT=y diff --git a/tools/testing/selftests/bpf/config.aarch64 
b/tools/testing/selftests/bpf/config.aarch64 index e1495a4bbc99..7efad36ceb26 100644 --- a/tools/testing/selftests/bpf/config.aarch64 +++ b/tools/testing/selftests/bpf/config.aarch64 @@ -31,10 +31,7 @@ CONFIG_COMPAT=y CONFIG_CPUSETS=y CONFIG_CRASH_DUMP=y CONFIG_CRYPTO_USER_API_RNG=y -CONFIG_CRYPTO_USER_API_SKCIPHER=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y -CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_INFO_REDUCED=n CONFIG_DEBUG_LIST=y CONFIG_DEBUG_LOCKDEP=y @@ -46,7 +43,6 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEVTMPFS=y CONFIG_DRM=y -CONFIG_DUMMY=y CONFIG_EXPERT=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -70,13 +66,11 @@ CONFIG_HZ_100=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_IKHEADERS=y CONFIG_INET6_ESP=y -CONFIG_INET_ESP=y CONFIG_INET=y CONFIG_INPUT_EVDEV=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPVLAN=y CONFIG_JUMP_LABEL=y @@ -97,22 +91,18 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_NAMESPACES=y CONFIG_NET_ACT_BPF=y -CONFIG_NET_ACT_GACT=y CONFIG_NETDEVICES=y CONFIG_NETFILTER_XT_MATCH_BPF=y CONFIG_NETFILTER_XT_TARGET_MARK=y CONFIG_NET_KEY=y -CONFIG_NET_SCH_FQ=y CONFIG_NET_VRF=y CONFIG_NET=y -CONFIG_NF_TABLES=y CONFIG_NLMON=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_OVERLAY_FS=y CONFIG_PACKET_DIAG=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI_HOST_GENERIC=y @@ -149,7 +139,6 @@ CONFIG_TASK_XACCT=y CONFIG_TCG_TIS=y CONFIG_TCG_TPM=y CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_DCTCP=y CONFIG_TLS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS=y @@ -161,6 +150,5 @@ CONFIG_UPROBES=y CONFIG_USER_NS=y CONFIG_VETH=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/bpf/config.ppc64el b/tools/testing/selftests/bpf/config.ppc64el index 9acf389dc4ce..b53afb5e0b71 100644 --- 
a/tools/testing/selftests/bpf/config.ppc64el +++ b/tools/testing/selftests/bpf/config.ppc64el @@ -54,7 +54,6 @@ CONFIG_NET=y CONFIG_NO_HZ_IDLE=y CONFIG_NONPORTABLE=y CONFIG_NR_CPUS=256 -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI_HOST_GENERIC=y diff --git a/tools/testing/selftests/bpf/config.riscv64 b/tools/testing/selftests/bpf/config.riscv64 index bb7043a80e1a..7bee24a79a71 100644 --- a/tools/testing/selftests/bpf/config.riscv64 +++ b/tools/testing/selftests/bpf/config.riscv64 @@ -48,7 +48,6 @@ CONFIG_NET_VRF=y CONFIG_NONPORTABLE=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x index 26c3bc2ce11d..db61878148e4 100644 --- a/tools/testing/selftests/bpf/config.s390x +++ b/tools/testing/selftests/bpf/config.s390x @@ -22,10 +22,7 @@ CONFIG_CHECKPOINT_RESTORE=y CONFIG_CPUSETS=y CONFIG_CRASH_DUMP=y CONFIG_CRYPTO_USER_API_RNG=y -CONFIG_CRYPTO_USER_API_SKCIPHER=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y -CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_NOTIFIERS=y @@ -56,11 +53,9 @@ CONFIG_IDLE_PAGE_TRACKING=y CONFIG_IKHEADERS=y CONFIG_INET6_ESP=y CONFIG_INET=y -CONFIG_INET_ESP=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPVLAN=y CONFIG_JUMP_LABEL=y @@ -83,18 +78,14 @@ CONFIG_MEMORY_HOTREMOVE=y CONFIG_NAMESPACES=y CONFIG_NET=y CONFIG_NET_ACT_BPF=y -CONFIG_NET_ACT_GACT=y CONFIG_NET_KEY=y -CONFIG_NET_SCH_FQ=y CONFIG_NET_VRF=y CONFIG_NETDEVICES=y CONFIG_NETFILTER_XT_MATCH_BPF=y CONFIG_NETFILTER_XT_TARGET_MARK=y -CONFIG_NF_TABLES=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y @@ -119,7 +110,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_TASK_XACCT=y 
CONFIG_TASKSTATS=y CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_DCTCP=y CONFIG_TLS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y @@ -131,6 +121,5 @@ CONFIG_UPROBES=y CONFIG_USER_NS=y CONFIG_VETH=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index 5e713ef7caa3..42ad817b00ae 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -44,7 +44,6 @@ CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_XXHASH=y CONFIG_DCB=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEFAULT_FQ_CODEL=y @@ -104,12 +103,10 @@ CONFIG_HZ_1000=y CONFIG_INET=y CONFIG_INPUT_EVDEV=y CONFIG_INTEL_POWERCLAMP=y -CONFIG_IP6_NF_IPTABLES=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MROUTE=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y CONFIG_IP_ROUTE_MULTIPATH=y @@ -162,7 +159,6 @@ CONFIG_NUMA=y CONFIG_NUMA_BALANCING=y CONFIG_NVMEM=y CONFIG_OSF_PARTITION=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y @@ -220,7 +216,6 @@ CONFIG_VALIDATE_FS_PARSER=y CONFIG_VETH=y CONFIG_VIRT_DRIVERS=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_X86_ACPI_CPUFREQ=y CONFIG_X86_CPUID=y diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 72b5c174ab3b..cdf7b6641444 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -457,7 +457,7 @@ int append_tid(char *str, size_t sz) if (end + 8 > sz) return -1; - sprintf(&str[end], "%07d", gettid()); + sprintf(&str[end], "%07ld", sys_gettid()); str[end + 7] = '\0'; return 0; diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 
1d53a8561ee2..24c509ce4e5b 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -42,11 +42,11 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "2"}, - {1, "R3_w", "4"}, - {2, "R3_w", "8"}, - {3, "R3_w", "16"}, - {4, "R3_w", "32"}, + {0, "R3", "2"}, + {1, "R3", "4"}, + {2, "R3", "8"}, + {3, "R3", "16"}, + {4, "R3", "32"}, }, }, { @@ -70,17 +70,17 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "1"}, - {1, "R3_w", "2"}, - {2, "R3_w", "4"}, - {3, "R3_w", "8"}, - {4, "R3_w", "16"}, - {5, "R3_w", "1"}, - {6, "R4_w", "32"}, - {7, "R4_w", "16"}, - {8, "R4_w", "8"}, - {9, "R4_w", "4"}, - {10, "R4_w", "2"}, + {0, "R3", "1"}, + {1, "R3", "2"}, + {2, "R3", "4"}, + {3, "R3", "8"}, + {4, "R3", "16"}, + {5, "R3", "1"}, + {6, "R4", "32"}, + {7, "R4", "16"}, + {8, "R4", "8"}, + {9, "R4", "4"}, + {10, "R4", "2"}, }, }, { @@ -99,12 +99,12 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "4"}, - {1, "R3_w", "8"}, - {2, "R3_w", "10"}, - {3, "R4_w", "8"}, - {4, "R4_w", "12"}, - {5, "R4_w", "14"}, + {0, "R3", "4"}, + {1, "R3", "8"}, + {2, "R3", "10"}, + {3, "R4", "8"}, + {4, "R4", "12"}, + {5, "R4", "14"}, }, }, { @@ -121,10 +121,10 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "7"}, - {1, "R3_w", "7"}, - {2, "R3_w", "14"}, - {3, "R3_w", "56"}, + {0, "R3", "7"}, + {1, "R3", "7"}, + {2, "R3", "14"}, + {3, "R3", "56"}, }, }, @@ -162,19 +162,19 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R0_w", "pkt(off=8,r=8)"}, - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R3_w", "var_off=(0x0; 0x1fe)"}, - {8, "R3_w", "var_off=(0x0; 0x3fc)"}, - {9, "R3_w", "var_off=(0x0; 0x7f8)"}, - {10, "R3_w", "var_off=(0x0; 0xff0)"}, - {12, "R3_w", 
"pkt_end()"}, - {17, "R4_w", "var_off=(0x0; 0xff)"}, - {18, "R4_w", "var_off=(0x0; 0x1fe0)"}, - {19, "R4_w", "var_off=(0x0; 0xff0)"}, - {20, "R4_w", "var_off=(0x0; 0x7f8)"}, - {21, "R4_w", "var_off=(0x0; 0x3fc)"}, - {22, "R4_w", "var_off=(0x0; 0x1fe)"}, + {6, "R0", "pkt(off=8,r=8)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R3", "var_off=(0x0; 0x1fe)"}, + {8, "R3", "var_off=(0x0; 0x3fc)"}, + {9, "R3", "var_off=(0x0; 0x7f8)"}, + {10, "R3", "var_off=(0x0; 0xff0)"}, + {12, "R3", "pkt_end()"}, + {17, "R4", "var_off=(0x0; 0xff)"}, + {18, "R4", "var_off=(0x0; 0x1fe0)"}, + {19, "R4", "var_off=(0x0; 0xff0)"}, + {20, "R4", "var_off=(0x0; 0x7f8)"}, + {21, "R4", "var_off=(0x0; 0x3fc)"}, + {22, "R4", "var_off=(0x0; 0x1fe)"}, }, }, { @@ -195,16 +195,16 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R4_w", "var_off=(0x0; 0xff)"}, - {8, "R4_w", "var_off=(0x0; 0xff)"}, - {9, "R4_w", "var_off=(0x0; 0xff)"}, - {10, "R4_w", "var_off=(0x0; 0x1fe)"}, - {11, "R4_w", "var_off=(0x0; 0xff)"}, - {12, "R4_w", "var_off=(0x0; 0x3fc)"}, - {13, "R4_w", "var_off=(0x0; 0xff)"}, - {14, "R4_w", "var_off=(0x0; 0x7f8)"}, - {15, "R4_w", "var_off=(0x0; 0xff0)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R4", "var_off=(0x0; 0xff)"}, + {8, "R4", "var_off=(0x0; 0xff)"}, + {9, "R4", "var_off=(0x0; 0xff)"}, + {10, "R4", "var_off=(0x0; 0x1fe)"}, + {11, "R4", "var_off=(0x0; 0xff)"}, + {12, "R4", "var_off=(0x0; 0x3fc)"}, + {13, "R4", "var_off=(0x0; 0xff)"}, + {14, "R4", "var_off=(0x0; 0x7f8)"}, + {15, "R4", "var_off=(0x0; 0xff0)"}, }, }, { @@ -235,14 +235,14 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {2, "R5_w", "pkt(r=0)"}, - {4, "R5_w", "pkt(off=14,r=0)"}, - {5, "R4_w", "pkt(off=14,r=0)"}, + {2, "R5", "pkt(r=0)"}, + {4, "R5", "pkt(off=14,r=0)"}, + {5, "R4", "pkt(off=14,r=0)"}, {9, "R2", "pkt(r=18)"}, {10, "R5", "pkt(off=14,r=18)"}, - {10, "R4_w", 
"var_off=(0x0; 0xff)"}, - {13, "R4_w", "var_off=(0x0; 0xffff)"}, - {14, "R4_w", "var_off=(0x0; 0xffff)"}, + {10, "R4", "var_off=(0x0; 0xff)"}, + {13, "R4", "var_off=(0x0; 0xffff)"}, + {14, "R4", "var_off=(0x0; 0xffff)"}, }, }, { @@ -299,12 +299,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. */ - {11, "R5_w", "pkt(id=1,off=14,"}, + {11, "R5", "pkt(id=1,off=14,"}, /* At the time the word size load is performed from R5, * it's total offset is NET_IP_ALIGN + reg->off (0) + * reg->aux_off (14) which is 16. Then the variable @@ -320,12 +320,12 @@ static struct bpf_align_test tests[] = { * instruction to validate R5 state. We also check * that R4 is what it should be in such case. */ - {18, "R4_w", "var_off=(0x0; 0x3fc)"}, - {18, "R5_w", "var_off=(0x0; 0x3fc)"}, + {18, "R4", "var_off=(0x0; 0x3fc)"}, + {18, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant offset is added to R5, resulting in * reg->off of 14. */ - {19, "R5_w", "pkt(id=2,off=14,"}, + {19, "R5", "pkt(id=2,off=14,"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off * (14) which is 16. Then the variable offset is 4-byte @@ -337,21 +337,21 @@ static struct bpf_align_test tests[] = { /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. */ - {26, "R5_w", "pkt(off=14,r=8)"}, + {26, "R5", "pkt(off=14,r=8)"}, /* Variable offset is added to R5, resulting in a * variable offset of (4n). See comment for insn #18 * for R4 = R5 trick. 
*/ - {28, "R4_w", "var_off=(0x0; 0x3fc)"}, - {28, "R5_w", "var_off=(0x0; 0x3fc)"}, + {28, "R4", "var_off=(0x0; 0x3fc)"}, + {28, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant is added to R5 again, setting reg->off to 18. */ - {29, "R5_w", "pkt(id=3,off=18,"}, + {29, "R5", "pkt(id=3,off=18,"}, /* And once more we add a variable; resulting var_off * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {31, "R4_w", "var_off=(0x0; 0x7fc)"}, - {31, "R5_w", "var_off=(0x0; 0x7fc)"}, + {31, "R4", "var_off=(0x0; 0x7fc)"}, + {31, "R5", "var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so @@ -397,12 +397,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {8, "R6_w", "var_off=(0x2; 0x7fc)"}, + {8, "R6", "var_off=(0x2; 0x7fc)"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w", "var_off=(0x2; 0x7fc)"}, + {11, "R5", "var_off=(0x2; 0x7fc)"}, {12, "R4", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -414,11 +414,11 @@ static struct bpf_align_test tests[] = { /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ - {17, "R6_w", "var_off=(0x0; 0x3fc)"}, + {17, "R6", "var_off=(0x0; 0x3fc)"}, /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). 
*/ - {19, "R5_w", "var_off=(0x2; 0xffc)"}, + {19, "R5", "var_off=(0x2; 0xffc)"}, {20, "R4", "var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -459,18 +459,18 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {3, "R5_w", "pkt_end()"}, + {3, "R5", "pkt_end()"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {5, "R5_w", "var_off=(0x0; 0xfffffffffffffffc)"}, + {5, "R5", "var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. */ - {6, "R5_w", "var_off=(0x2; 0xfffffffffffffffc)"}, + {6, "R5", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ {9, "R5", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, - {12, "R4_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {11, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {12, "R4", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -478,7 +478,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {15, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, } }, { @@ -513,12 +513,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. 
*/ - {6, "R2_w", "pkt(r=8)"}, - {8, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {8, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {9, "R6_w", "var_off=(0x2; 0x7fc)"}, + {9, "R6", "var_off=(0x2; 0x7fc)"}, /* New unknown value in R7 is (4n) */ - {10, "R7_w", "var_off=(0x0; 0x3fc)"}, + {10, "R7", "var_off=(0x0; 0x3fc)"}, /* Subtracting it from R6 blows our unsigned bounds */ {11, "R6", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ @@ -566,16 +566,16 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {9, "R6_w", "var_off=(0x0; 0x3c)"}, + {6, "R2", "pkt(r=8)"}, + {9, "R6", "var_off=(0x0; 0x3c)"}, /* Adding 14 makes R6 be (4n+2) */ - {10, "R6_w", "var_off=(0x2; 0x7c)"}, + {10, "R6", "var_off=(0x2; 0x7c)"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w", "var_off=(0xffffffffffffff82; 0x7c)"}, + {13, "R5", "var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ - {14, "R7_w", "var_off=(0x0; 0x7fc)"}, + {14, "R7", "var_off=(0x0; 0x7fc)"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w", "var_off=(0x2; 0x7fc)"}, + {16, "R5", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. 
Then the variable offset is (4n+2), so diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c index 0223fce4db2b..693fd86fbde6 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c @@ -40,8 +40,13 @@ static void *spin_lock_thread(void *arg) err = bpf_prog_test_run_opts(prog_fd, &topts); ASSERT_OK(err, "test_run err"); + + if (topts.retval == -EOPNOTSUPP) + goto end; + ASSERT_EQ((int)topts.retval, 0, "test_run retval"); +end: pthread_exit(arg); } @@ -63,6 +68,7 @@ static void test_arena_spin_lock_size(int size) skel = arena_spin_lock__open_and_load(); if (!ASSERT_OK_PTR(skel, "arena_spin_lock__open_and_load")) return; + if (skel->data->test_skip == 2) { test__skip(); goto end; @@ -86,6 +92,13 @@ static void test_arena_spin_lock_size(int size) goto end_barrier; } + if (skel->data->test_skip == 3) { + printf("%s:SKIP: CONFIG_NR_CPUS exceed the maximum supported by arena spinlock\n", + __func__); + test__skip(); + goto end_barrier; + } + ASSERT_EQ(skel->bss->counter, repeat * nthreads, "check counter value"); end_barrier: diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c index 13e101f370a1..92b5f378bfb8 100644 --- a/tools/testing/selftests/bpf/prog_tests/atomics.c +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -165,11 +165,17 @@ static void test_xchg(struct atomics_lskel *skel) void test_atomics(void) { struct atomics_lskel *skel; + int err; - skel = atomics_lskel__open_and_load(); - if (!ASSERT_OK_PTR(skel, "atomics skeleton load")) + skel = atomics_lskel__open(); + if (!ASSERT_OK_PTR(skel, "atomics skeleton open")) return; + skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = atomics_lskel__load(skel); + if (!ASSERT_OK(err, "atomics skeleton load")) + goto cleanup; + if (skel->data->skip_tests) { printf("%s:SKIP:no ENABLE_ATOMICS_TESTS 
(missing Clang BPF atomics support)", __func__); diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index cabc51c2ca6b..9e77e5da7097 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -3,6 +3,7 @@ #include "test_attach_kprobe_sleepable.skel.h" #include "test_attach_probe_manual.skel.h" #include "test_attach_probe.skel.h" +#include "kprobe_write_ctx.skel.h" /* this is how USDT semaphore is actually defined, except volatile modifier */ volatile unsigned short uprobe_ref_ctr __attribute__((unused)) __attribute((section(".probes"))); @@ -201,6 +202,31 @@ cleanup: test_attach_probe_manual__destroy(skel); } +#ifdef __x86_64__ +/* attach the kprobe_write_ctx program as a plain kprobe; the attach is expected to fail */ +static void test_attach_kprobe_write_ctx(void) +{ + struct kprobe_write_ctx *skel = NULL; + struct bpf_link *link = NULL; + + skel = kprobe_write_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_write_ctx__open_and_load")) + return; + + link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_write_ctx, + "bpf_fentry_test1", NULL); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_kprobe_opts")) + bpf_link__destroy(link); + + kprobe_write_ctx__destroy(skel); +} +#else +static void test_attach_kprobe_write_ctx(void) +{ + test__skip(); +} +#endif + static void test_attach_probe_auto(struct test_attach_probe *skel) { struct bpf_link *uprobe_err_link; @@ -406,6 +432,8 @@ void test_attach_probe(void) test_attach_uprobe_long_event_name(); if (test__start_subtest("kprobe-long_name")) test_attach_kprobe_long_event_name(); + if (test__start_subtest("kprobe-write-ctx")) + test_attach_kprobe_write_ctx(); cleanup: test_attach_probe__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 4a0670c056ba..75f4dff7d042 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -450,8 +450,7 @@ static void pe_subtest(struct test_bpf_cookie *skel) attr.size = sizeof(attr); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_CPU_CLOCK; - attr.freq = 1; - attr.sample_freq = 10000; + attr.sample_period = 100000; pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); if (!ASSERT_GE(pfd, 0, "perf_fd")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 82903585c870..10cba526d3e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -63,7 +63,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t) /* tests with t->known_ptr_sz have no "long" or "unsigned long" type, * so it's impossible to determine correct pointer size; but if they - * do, it should be 8 regardless of host architecture, becaues BPF + * do, it should be 8 regardless of host architecture, because BPF * target is always 64-bit */ if (!t->known_ptr_sz) { diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c b/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c index e0dd966e4a3e..5ad904e9d15d 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c @@ -44,7 +44,7 @@ static void test_read_cgroup_xattr(void) if (!ASSERT_OK_PTR(skel, "read_cgroupfs_xattr__open_and_load")) goto out; - skel->bss->target_pid = gettid(); + skel->bss->target_pid = sys_gettid(); if (!ASSERT_OK(read_cgroupfs_xattr__attach(skel), "read_cgroupfs_xattr__attach")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c index adda85f97058..4b42fbc96efc 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c +++ 
b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c @@ -4,6 +4,8 @@ #define _GNU_SOURCE #include <cgroup_helpers.h> #include <test_progs.h> +#include <sched.h> +#include <sys/wait.h> #include "cgrp_kfunc_failure.skel.h" #include "cgrp_kfunc_success.skel.h" @@ -87,6 +89,72 @@ static const char * const success_tests[] = { "test_cgrp_from_id", }; +static void test_cgrp_from_id_ns(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct cgrp_kfunc_success *skel; + struct bpf_program *prog; + int pid, pipe_fd[2]; + + skel = open_load_cgrp_kfunc_skel(); + if (!ASSERT_OK_PTR(skel, "open_load_skel")) + return; + + if (!ASSERT_OK(skel->bss->err, "pre_mkdir_err")) + goto cleanup; + + prog = skel->progs.test_cgrp_from_id_ns; + + if (!ASSERT_OK(pipe(pipe_fd), "pipe")) + goto cleanup; + + pid = fork(); + if (!ASSERT_GE(pid, 0, "fork result")) { + close(pipe_fd[0]); + close(pipe_fd[1]); + goto cleanup; + } + + if (pid == 0) { + int ret = 0; + + close(pipe_fd[0]); + + if (!ASSERT_GE(cgroup_setup_and_join("cgrp_from_id_ns"), 0, "join cgroup")) + exit(1); + + if (!ASSERT_OK(unshare(CLONE_NEWCGROUP), "unshare cgns")) + exit(1); + + ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(ret, "test run ret")) + exit(1); + + if (!ASSERT_OK(opts.retval, "test run retval")) + exit(1); + + if (!ASSERT_EQ(write(pipe_fd[1], &ret, sizeof(ret)), sizeof(ret), "write pipe")) + exit(1); + + exit(0); + } else { + int res; + + close(pipe_fd[1]); + + ASSERT_EQ(read(pipe_fd[0], &res, sizeof(res)), sizeof(res), "read res"); + ASSERT_EQ(waitpid(pid, NULL, 0), pid, "wait on child"); + + remove_cgroup_pid("cgrp_from_id_ns", pid); + + ASSERT_OK(res, "result from run"); + } + + close(pipe_fd[0]); +cleanup: + cgrp_kfunc_success__destroy(skel); +} + void test_cgrp_kfunc(void) { int i, err; @@ -102,6 +170,9 @@ void test_cgrp_kfunc(void) run_success_test(success_tests[i]); } + if (test__start_subtest("test_cgrp_from_id_ns")) + test_cgrp_from_id_ns(); + RUN_TESTS(cgrp_kfunc_failure); 
cleanup: diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 9b2d9ceda210..b9f86cb91e81 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -32,6 +32,8 @@ static struct { {"test_ringbuf", SETUP_SYSCALL_SLEEP}, {"test_skb_readonly", SETUP_SKB_PROG}, {"test_dynptr_skb_data", SETUP_SKB_PROG}, + {"test_dynptr_skb_meta_data", SETUP_SKB_PROG}, + {"test_dynptr_skb_meta_flags", SETUP_SKB_PROG}, {"test_adjust", SETUP_SYSCALL_SLEEP}, {"test_adjust_err", SETUP_SYSCALL_SLEEP}, {"test_zero_size_dynptr", SETUP_SYSCALL_SLEEP}, diff --git a/tools/testing/selftests/bpf/prog_tests/fd_array.c b/tools/testing/selftests/bpf/prog_tests/fd_array.c index 241b2c8c6e0f..c534b4d5f9da 100644 --- a/tools/testing/selftests/bpf/prog_tests/fd_array.c +++ b/tools/testing/selftests/bpf/prog_tests/fd_array.c @@ -293,7 +293,7 @@ static int get_btf_id_by_fd(int btf_fd, __u32 *id) * 1) Create a new btf, it's referenced only by a file descriptor, so refcnt=1 * 2) Load a BPF prog with fd_array[0] = btf_fd; now btf's refcnt=2 * 3) Close the btf_fd, now refcnt=1 - * Wait and check that BTF stil exists. + * Wait and check that BTF still exists. 
*/ static void check_fd_array_cnt__referenced_btfs(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 130f5b82d2e6..5ef1804e44df 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -12,13 +12,24 @@ void test_fentry_fexit(void) int err, prog_fd, i; LIBBPF_OPTS(bpf_test_run_opts, topts); - fentry_skel = fentry_test_lskel__open_and_load(); + fentry_skel = fentry_test_lskel__open(); if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) goto close_prog; - fexit_skel = fexit_test_lskel__open_and_load(); + + fentry_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fentry_test_lskel__load(fentry_skel); + if (!ASSERT_OK(err, "fentry_skel_load")) + goto close_prog; + + fexit_skel = fexit_test_lskel__open(); if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) goto close_prog; + fexit_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fexit_test_lskel__load(fexit_skel); + if (!ASSERT_OK(err, "fexit_skel_load")) + goto close_prog; + err = fentry_test_lskel__attach(fentry_skel); if (!ASSERT_OK(err, "fentry_attach")) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index aee1bc77a17f..ec882328eb59 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -43,8 +43,13 @@ static void fentry_test(void) struct fentry_test_lskel *fentry_skel = NULL; int err; - fentry_skel = fentry_test_lskel__open_and_load(); - if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) + fentry_skel = fentry_test_lskel__open(); + if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_open")) + goto cleanup; + + fentry_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fentry_test_lskel__load(fentry_skel); + if (!ASSERT_OK(err, "fentry_skel_load")) goto cleanup; err = 
fentry_test_common(fentry_skel); diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index 1c13007e37dd..94eed753560c 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -43,8 +43,13 @@ static void fexit_test(void) struct fexit_test_lskel *fexit_skel = NULL; int err; - fexit_skel = fexit_test_lskel__open_and_load(); - if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) + fexit_skel = fexit_test_lskel__open(); + if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_open")) + goto cleanup; + + fexit_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fexit_test_lskel__load(fexit_skel); + if (!ASSERT_OK(err, "fexit_skel_load")) goto cleanup; err = fexit_test_common(fexit_skel); diff --git a/tools/testing/selftests/bpf/prog_tests/free_timer.c b/tools/testing/selftests/bpf/prog_tests/free_timer.c index b7b77a6b2979..0de8facca4c5 100644 --- a/tools/testing/selftests/bpf/prog_tests/free_timer.c +++ b/tools/testing/selftests/bpf/prog_tests/free_timer.c @@ -124,6 +124,10 @@ void test_free_timer(void) int err; skel = free_timer__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "open_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/kernel_flag.c b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c index a133354ac9bc..97b00c7efe94 100644 --- a/tools/testing/selftests/bpf/prog_tests/kernel_flag.c +++ b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c @@ -16,7 +16,7 @@ void test_kernel_flag(void) if (!ASSERT_OK_PTR(lsm_skel, "lsm_skel")) return; - lsm_skel->bss->monitored_tid = gettid(); + lsm_skel->bss->monitored_tid = sys_gettid(); ret = test_kernel_flag__attach(lsm_skel); if (!ASSERT_OK(ret, "test_kernel_flag__attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 
e19ef509ebf8..6cfaa978bc9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -7,6 +7,7 @@ #include "kprobe_multi_session.skel.h" #include "kprobe_multi_session_cookie.skel.h" #include "kprobe_multi_verifier.skel.h" +#include "kprobe_write_ctx.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -422,220 +423,6 @@ static void test_unique_match(void) kprobe_multi__destroy(skel); } -static size_t symbol_hash(long key, void *ctx __maybe_unused) -{ - return str_hash((const char *) key); -} - -static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) -{ - return strcmp((const char *) key1, (const char *) key2) == 0; -} - -static bool is_invalid_entry(char *buf, bool kernel) -{ - if (kernel && strchr(buf, '[')) - return true; - if (!kernel && !strchr(buf, '[')) - return true; - return false; -} - -static bool skip_entry(char *name) -{ - /* - * We attach to almost all kernel functions and some of them - * will cause 'suspicious RCU usage' when fprobe is attached - * to them. Filter out the current culprits - arch_cpu_idle - * default_idle and rcu_* functions. - */ - if (!strcmp(name, "arch_cpu_idle")) - return true; - if (!strcmp(name, "default_idle")) - return true; - if (!strncmp(name, "rcu_", 4)) - return true; - if (!strcmp(name, "bpf_dispatcher_xdp_func")) - return true; - if (!strncmp(name, "__ftrace_invalid_address__", - sizeof("__ftrace_invalid_address__") - 1)) - return true; - return false; -} - -/* Do comparision by ignoring '.llvm.<hash>' suffixes. */ -static int compare_name(const char *name1, const char *name2) -{ - const char *res1, *res2; - int len1, len2; - - res1 = strstr(name1, ".llvm."); - res2 = strstr(name2, ".llvm."); - len1 = res1 ? res1 - name1 : strlen(name1); - len2 = res2 ? res2 - name2 : strlen(name2); - - if (len1 == len2) - return strncmp(name1, name2, len1); - if (len1 < len2) - return strncmp(name1, name2, len1) <= 0 ? 
-1 : 1; - return strncmp(name1, name2, len2) >= 0 ? 1 : -1; -} - -static int load_kallsyms_compare(const void *p1, const void *p2) -{ - return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); -} - -static int search_kallsyms_compare(const void *p1, const struct ksym *p2) -{ - return compare_name(p1, p2->name); -} - -static int get_syms(char ***symsp, size_t *cntp, bool kernel) -{ - size_t cap = 0, cnt = 0; - char *name = NULL, *ksym_name, **syms = NULL; - struct hashmap *map; - struct ksyms *ksyms; - struct ksym *ks; - char buf[256]; - FILE *f; - int err = 0; - - ksyms = load_kallsyms_custom_local(load_kallsyms_compare); - if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local")) - return -EINVAL; - - /* - * The available_filter_functions contains many duplicates, - * but other than that all symbols are usable in kprobe multi - * interface. - * Filtering out duplicates by using hashmap__add, which won't - * add existing entry. - */ - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) - f = fopen("/sys/kernel/tracing/available_filter_functions", "r"); - else - f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); - - if (!f) - return -EINVAL; - - map = hashmap__new(symbol_hash, symbol_equal, NULL); - if (IS_ERR(map)) { - err = libbpf_get_error(map); - goto error; - } - - while (fgets(buf, sizeof(buf), f)) { - if (is_invalid_entry(buf, kernel)) - continue; - - free(name); - if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) - continue; - if (skip_entry(name)) - continue; - - ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); - if (!ks) { - err = -EINVAL; - goto error; - } - - ksym_name = ks->name; - err = hashmap__add(map, ksym_name, 0); - if (err == -EEXIST) { - err = 0; - continue; - } - if (err) - goto error; - - err = libbpf_ensure_mem((void **) &syms, &cap, - sizeof(*syms), cnt + 1); - if (err) - goto error; - - syms[cnt++] = ksym_name; - } - - *symsp = syms; - *cntp = cnt; - -error: - 
free(name); - fclose(f); - hashmap__free(map); - if (err) - free(syms); - return err; -} - -static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) -{ - unsigned long *addr, *addrs, *tmp_addrs; - int err = 0, max_cnt, inc_cnt; - char *name = NULL; - size_t cnt = 0; - char buf[256]; - FILE *f; - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) - f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); - else - f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); - - if (!f) - return -ENOENT; - - /* In my local setup, the number of entries is 50k+ so Let us initially - * allocate space to hold 64k entries. If 64k is not enough, incrementally - * increase 1k each time. - */ - max_cnt = 65536; - inc_cnt = 1024; - addrs = malloc(max_cnt * sizeof(long)); - if (addrs == NULL) { - err = -ENOMEM; - goto error; - } - - while (fgets(buf, sizeof(buf), f)) { - if (is_invalid_entry(buf, kernel)) - continue; - - free(name); - if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) - continue; - if (skip_entry(name)) - continue; - - if (cnt == max_cnt) { - max_cnt += inc_cnt; - tmp_addrs = realloc(addrs, max_cnt); - if (!tmp_addrs) { - err = -ENOMEM; - goto error; - } - addrs = tmp_addrs; - } - - addrs[cnt++] = (unsigned long)addr; - } - - *addrsp = addrs; - *cntp = cnt; - -error: - free(name); - fclose(f); - if (err) - free(addrs); - return err; -} - static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { long attach_start_ns, attach_end_ns; @@ -670,7 +457,7 @@ static void test_kprobe_multi_bench_attach(bool kernel) char **syms = NULL; size_t cnt = 0; - if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) + if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, kernel), "bpf_get_ksyms")) return; skel = kprobe_multi_empty__open_and_load(); @@ -696,13 +483,13 @@ static void test_kprobe_multi_bench_attach_addr(bool kernel) size_t cnt = 0; int err; - err = get_addrs(&addrs, &cnt, kernel); + 
err = bpf_get_addrs(&addrs, &cnt, kernel); if (err == -ENOENT) { test__skip(); return; } - if (!ASSERT_OK(err, "get_addrs")) + if (!ASSERT_OK(err, "bpf_get_addrs")) return; skel = kprobe_multi_empty__open_and_load(); @@ -753,6 +540,30 @@ cleanup: kprobe_multi_override__destroy(skel); } +#ifdef __x86_64__ +static void test_attach_write_ctx(void) +{ + struct kprobe_write_ctx *skel = NULL; + struct bpf_link *link = NULL; + + skel = kprobe_write_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_write_ctx__open_and_load")) + return; + + link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_multi_write_ctx, + "bpf_fentry_test1", NULL); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_kprobe_opts")) + bpf_link__destroy(link); + + kprobe_write_ctx__destroy(skel); +} +#else +static void test_attach_write_ctx(void) +{ + test__skip(); +} +#endif + void serial_test_kprobe_multi_bench_attach(void) { if (test__start_subtest("kernel")) @@ -792,5 +603,7 @@ void test_kprobe_multi_test(void) test_session_cookie_skel_api(); if (test__start_subtest("unique_match")) test_unique_match(); + if (test__start_subtest("attach_write_ctx")) + test_attach_write_ctx(); RUN_TESTS(kprobe_multi_verifier); } diff --git a/tools/testing/selftests/bpf/prog_tests/map_excl.c b/tools/testing/selftests/bpf/prog_tests/map_excl.c new file mode 100644 index 000000000000..6bdc6d6de0da --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_excl.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Google LLC. 
*/ +#define _GNU_SOURCE +#include <unistd.h> +#include <sys/syscall.h> +#include <test_progs.h> +#include <bpf/btf.h> + +#include "map_excl.skel.h" + +static void test_map_excl_allowed(void) +{ + struct map_excl *skel = map_excl__open(); + int err; + + err = bpf_map__set_exclusive_program(skel->maps.excl_map, skel->progs.should_have_access); + if (!ASSERT_OK(err, "bpf_map__set_exclusive_program")) + goto out; + + bpf_program__set_autoload(skel->progs.should_have_access, true); + bpf_program__set_autoload(skel->progs.should_not_have_access, false); + + err = map_excl__load(skel); + ASSERT_OK(err, "map_excl__load"); +out: + map_excl__destroy(skel); +} + +static void test_map_excl_denied(void) +{ + struct map_excl *skel = map_excl__open(); + int err; + + err = bpf_map__set_exclusive_program(skel->maps.excl_map, skel->progs.should_have_access); + if (!ASSERT_OK(err, "bpf_map__make_exclusive")) + goto out; + + bpf_program__set_autoload(skel->progs.should_have_access, false); + bpf_program__set_autoload(skel->progs.should_not_have_access, true); + + err = map_excl__load(skel); + ASSERT_EQ(err, -EACCES, "exclusive map access not denied\n"); +out: + map_excl__destroy(skel); + +} + +void test_map_excl(void) +{ + if (test__start_subtest("map_excl_allowed")) + test_map_excl_allowed(); + if (test__start_subtest("map_excl_denied")) + test_map_excl_denied(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 6d391d95f96e..70fa7ae93173 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -90,7 +90,7 @@ void test_module_attach(void) test_module_attach__detach(skel); - /* attach fentry/fexit and make sure it get's module reference */ + /* attach fentry/fexit and make sure it gets module reference */ link = bpf_program__attach(skel->progs.handle_fentry); if (!ASSERT_OK_PTR(link, "attach_fentry")) goto cleanup; diff --git 
a/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c b/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c new file mode 100644 index 000000000000..9ae49b587f3e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <test_progs.h> + + +#include "test_pinning_devmap.skel.h" + +void test_pinning_devmap_reuse(void) +{ + const char *pinpath1 = "/sys/fs/bpf/pinmap1"; + const char *pinpath2 = "/sys/fs/bpf/pinmap2"; + struct test_pinning_devmap *skel1 = NULL, *skel2 = NULL; + int err; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + + /* load the object a first time */ + skel1 = test_pinning_devmap__open_and_load(); + if (!ASSERT_OK_PTR(skel1, "skel_load1")) + goto out; + + /* load the object a second time, re-using the pinned map */ + skel2 = test_pinning_devmap__open_and_load(); + if (!ASSERT_OK_PTR(skel2, "skel_load2")) + goto out; + + /* we can close the reference safely without + * the map's refcount falling to 0 + */ + test_pinning_devmap__destroy(skel1); + skel1 = NULL; + + /* now, swap the pins */ + err = renameat2(0, pinpath1, 0, pinpath2, RENAME_EXCHANGE); + if (!ASSERT_OK(err, "swap pins")) + goto out; + + /* load the object again, this time the re-use should fail */ + skel1 = test_pinning_devmap__open_and_load(); + if (!ASSERT_ERR_PTR(skel1, "skel_load3")) + goto out; + +out: + unlink(pinpath1); + unlink(pinpath2); + test_pinning_devmap__destroy(skel1); + test_pinning_devmap__destroy(skel2); +} diff --git a/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c b/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c index 14f2796076e0..7607cfc2408c 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c @@ -54,3 +54,128 @@ void test_prog_tests_framework(void) return; 
clear_test_state(state); } + +static void dummy_emit(const char *buf, bool force) {} + +void test_prog_tests_framework_expected_msgs(void) +{ + struct expected_msgs msgs; + int i, j, error_cnt; + const struct { + const char *name; + const char *log; + const char *expected; + struct expect_msg *pats; + } cases[] = { + { + .name = "simple-ok", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "simple-fail", + .log = "aaabbbddd", + .expected = "MATCHED SUBSTR: 'aaa'\n" + "EXPECTED SUBSTR: 'ccc'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-ok-mid", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "foo", .negative = true }, + { .substr = "bar", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-ok-tail", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "foo", .negative = true }, + {} + } + }, + { + .name = "negative-ok-head", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "foo", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-fail-head", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'aaa'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa", .negative = true }, + { .substr = "bbb" }, + {} + } + }, + { + .name = "negative-fail-tail", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'ccc'\n", + .pats = (struct expect_msg[]) { + { .substr = "bbb" }, + { .substr = "ccc", .negative = true }, + {} + } + }, + { + .name = "negative-fail-mid-1", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'bbb'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "bbb", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-fail-mid-2", + .log = "aaabbb222ccc", + .expected = "UNEXPECTED 
SUBSTR: '222'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "222", .negative = true }, + { .substr = "bbb", .negative = true }, + { .substr = "ccc" }, + {} + } + } + }; + + for (i = 0; i < ARRAY_SIZE(cases); i++) { + if (test__start_subtest(cases[i].name)) { + error_cnt = env.subtest_state->error_cnt; + msgs.patterns = cases[i].pats; + msgs.cnt = 0; + for (j = 0; cases[i].pats[j].substr; j++) + msgs.cnt++; + validate_msgs(cases[i].log, &msgs, dummy_emit); + fflush(stderr); + env.subtest_state->error_cnt = error_cnt; + if (cases[i].expected) + ASSERT_HAS_SUBSTR(env.subtest_state->log_buf, cases[i].expected, "expected output"); + else + ASSERT_STREQ(env.subtest_state->log_buf, "", "expected no output"); + test__end_subtest(); + } + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index e261b0e872db..d93a0c7b1786 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -623,7 +623,7 @@ static void range_cond(enum num_t t, struct range x, struct range y, *newx = range(t, x.a, x.b); *newy = range(t, y.a + 1, y.b); } else if (x.a == x.b && x.b == y.b) { - /* X is a constant matching rigth side of Y */ + /* X is a constant matching right side of Y */ *newx = range(t, x.a, x.b); *newy = range(t, y.a, y.b - 1); } else if (y.a == y.b && x.a == y.a) { @@ -631,7 +631,7 @@ static void range_cond(enum num_t t, struct range x, struct range y, *newx = range(t, x.a + 1, x.b); *newy = range(t, y.a, y.b); } else if (y.a == y.b && x.b == y.b) { - /* Y is a constant matching rigth side of X */ + /* Y is a constant matching right side of X */ *newx = range(t, x.a, x.b - 1); *newy = range(t, y.a, y.b); } else { diff --git a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c index 0703e987df89..8c6c2043a432 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c @@ -99,3 +99,19 @@ end: res_spin_lock__destroy(skel); return; } + +void serial_test_res_spin_lock_stress(void) +{ + if (libbpf_num_possible_cpus() < 3) { + test__skip(); + return; + } + + ASSERT_OK(load_module("bpf_test_rqspinlock.ko", false), "load module AA"); + sleep(5); + unload_module("bpf_test_rqspinlock", false); + + ASSERT_OK(load_module_params("bpf_test_rqspinlock.ko", "test_ab=1", false), "load module ABBA"); + sleep(5); + unload_module("bpf_test_rqspinlock", false); +} diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index e3ea5dc2f697..254fbfeab06a 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,22 +13,22 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "5: (bf) r1 = r0 ; R0_w=ptr_foo(id=2,ref_obj_id=2) " - "R1_w=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" + "5: (bf) r1 = r0 ; R0=ptr_foo(id=2,ref_obj_id=2) " + "R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", - "; R1_w=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" + "; R1=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0_w=map_value(id=1,map=array_map,ks=4,vs=8)" - " R1_w=map_value(id=1,map=array_map,ks=4,vs=8)\n" + " R0=map_value(id=1,map=array_map,ks=4,vs=8)" + " R1=map_value(id=1,map=array_map,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_innermapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" " R0=map_value(id=2,ks=4,vs=8)" - " R1_w=map_value(id=2,ks=4,vs=8)\n" + 
" R1=map_value(id=2,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" }, diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c index b7ba5cd47d96..271b5cc9fc01 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c @@ -39,7 +39,7 @@ retry: bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 0832fd787457..b277dddd5af7 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -66,7 +66,7 @@ retry: bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. 
stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c index df59e4ae2951..c23b97414813 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c @@ -1,46 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include "stacktrace_map.skel.h" void test_stacktrace_map(void) { + struct stacktrace_map *skel; int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; - const char *prog_name = "oncpu"; - int err, prog_fd, stack_trace_len; - const char *file = "./test_stacktrace_map.bpf.o"; - __u32 key, val, duration = 0; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link; + int err, stack_trace_len; + __u32 key, val, stack_id, duration = 0; + __u64 stack[PERF_MAX_STACK_DEPTH]; - err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + skel = stacktrace_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) return; - prog = bpf_object__find_program_by_name(obj, prog_name); - if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name)) - goto close_prog; - - link = bpf_program__attach_tracepoint(prog, "sched", "sched_switch"); - if (!ASSERT_OK_PTR(link, "attach_tp")) - goto close_prog; - - /* find map fds */ - control_map_fd = bpf_find_map(__func__, obj, "control_map"); - if (CHECK_FAIL(control_map_fd < 0)) - goto disable_pmu; - - stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); - if (CHECK_FAIL(stackid_hmap_fd < 0)) - goto disable_pmu; - - stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); - if (CHECK_FAIL(stackmap_fd < 0)) - goto disable_pmu; - - stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); - if (CHECK_FAIL(stack_amap_fd < 0)) - goto disable_pmu; + control_map_fd = bpf_map__fd(skel->maps.control_map); + stackid_hmap_fd = 
bpf_map__fd(skel->maps.stackid_hmap); + stackmap_fd = bpf_map__fd(skel->maps.stackmap); + stack_amap_fd = bpf_map__fd(skel->maps.stack_amap); + err = stacktrace_map__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; /* give some time for bpf program run */ sleep(1); @@ -50,26 +31,32 @@ void test_stacktrace_map(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto out; err = compare_map_keys(stackmap_fd, stackid_hmap_fd); if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto out; stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64); err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len); if (CHECK(err, "compare_stack_ips stackmap vs. 
stack_amap", "err %d errno %d\n", err, errno)) - goto disable_pmu; - -disable_pmu: - bpf_link__destroy(link); -close_prog: - bpf_object__close(obj); + goto out; + + stack_id = skel->bss->stack_id; + err = bpf_map_lookup_and_delete_elem(stackmap_fd, &stack_id, stack); + if (!ASSERT_OK(err, "lookup and delete target stack_id")) + goto out; + + err = bpf_map_lookup_elem(stackmap_fd, &stack_id, stack); + if (!ASSERT_EQ(err, -ENOENT, "lookup deleted stack_id")) + goto out; +out: + stacktrace_map__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c index c6ef06f55cdb..e985d51d3d47 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c @@ -5,7 +5,7 @@ void test_stacktrace_map_raw_tp(void) { const char *prog_name = "oncpu"; int control_map_fd, stackid_hmap_fd, stackmap_fd; - const char *file = "./test_stacktrace_map.bpf.o"; + const char *file = "./stacktrace_map.bpf.o"; __u32 key, val, duration = 0; int err, prog_fd; struct bpf_program *prog; @@ -46,7 +46,7 @@ void test_stacktrace_map_raw_tp(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c index 1932b1e0685c..dc2ccf6a14d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c @@ -40,7 +40,7 @@ void test_stacktrace_map_skip(void) skel->bss->control = 1; /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. 
+ * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (!ASSERT_OK(err, "compare_map_keys stackid_hmap vs. stackmap")) diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c index d9f0185dca61..c3cce5c292bd 100644 --- a/tools/testing/selftests/bpf/prog_tests/stream.c +++ b/tools/testing/selftests/bpf/prog_tests/stream.c @@ -2,7 +2,6 @@ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ #include <test_progs.h> #include <sys/mman.h> -#include <regex.h> #include "stream.skel.h" #include "stream_fail.skel.h" @@ -18,87 +17,6 @@ void test_stream_success(void) return; } -struct { - int prog_off; - const char *errstr; -} stream_error_arr[] = { - { - offsetof(struct stream, progs.stream_cond_break), - "ERROR: Timeout detected for may_goto instruction\n" - "CPU: [0-9]+ UID: 0 PID: [0-9]+ Comm: .*\n" - "Call trace:\n" - "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" - "|[ \t]+[^\n]+\n)*", - }, - { - offsetof(struct stream, progs.stream_deadlock), - "ERROR: AA or ABBA deadlock detected for bpf_res_spin_lock\n" - "Attempted lock = (0x[0-9a-fA-F]+)\n" - "Total held locks = 1\n" - "Held lock\\[ 0\\] = \\1\n" // Lock address must match - "CPU: [0-9]+ UID: 0 PID: [0-9]+ Comm: .*\n" - "Call trace:\n" - "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" - "|[ \t]+[^\n]+\n)*", - }, -}; - -static int match_regex(const char *pattern, const char *string) -{ - int err, rc; - regex_t re; - - err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE); - if (err) - return -1; - rc = regexec(&re, string, 0, NULL, 0); - regfree(&re); - return rc == 0 ? 
1 : 0; -} - -void test_stream_errors(void) -{ - LIBBPF_OPTS(bpf_test_run_opts, opts); - LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); - struct stream *skel; - int ret, prog_fd; - char buf[1024]; - - skel = stream__open_and_load(); - if (!ASSERT_OK_PTR(skel, "stream__open_and_load")) - return; - - for (int i = 0; i < ARRAY_SIZE(stream_error_arr); i++) { - struct bpf_program **prog; - - prog = (struct bpf_program **)(((char *)skel) + stream_error_arr[i].prog_off); - prog_fd = bpf_program__fd(*prog); - ret = bpf_prog_test_run_opts(prog_fd, &opts); - ASSERT_OK(ret, "ret"); - ASSERT_OK(opts.retval, "retval"); - -#if !defined(__x86_64__) - ASSERT_TRUE(1, "Timed may_goto unsupported, skip."); - if (i == 0) { - ret = bpf_prog_stream_read(prog_fd, 2, buf, sizeof(buf), &ropts); - ASSERT_EQ(ret, 0, "stream read"); - continue; - } -#endif - - ret = bpf_prog_stream_read(prog_fd, BPF_STREAM_STDERR, buf, sizeof(buf), &ropts); - ASSERT_GT(ret, 0, "stream read"); - ASSERT_LE(ret, 1023, "len for buf"); - buf[ret] = '\0'; - - ret = match_regex(stream_error_arr[i].errstr, buf); - if (!ASSERT_TRUE(ret == 1, "regex match")) - fprintf(stderr, "Output from stream:\n%s\n", buf); - } - - stream__destroy(skel); -} - void test_stream_syscall(void) { LIBBPF_OPTS(bpf_test_run_opts, opts); @@ -139,3 +57,52 @@ void test_stream_syscall(void) stream__destroy(skel); } + +static void test_address(struct bpf_program *prog, unsigned long *fault_addr_p) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); + int ret, prog_fd; + char fault_addr[64]; + char buf[1024]; + + prog_fd = bpf_program__fd(prog); + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(ret, "ret"); + ASSERT_OK(opts.retval, "retval"); + + sprintf(fault_addr, "0x%lx", *fault_addr_p); + + ret = bpf_prog_stream_read(prog_fd, BPF_STREAM_STDERR, buf, sizeof(buf), &ropts); + ASSERT_GT(ret, 0, "stream read"); + ASSERT_LE(ret, 1023, "len for buf"); + buf[ret] = '\0'; + + if 
(!ASSERT_HAS_SUBSTR(buf, fault_addr, "fault_addr")) { + fprintf(stderr, "Output from stream:\n%s\n", buf); + fprintf(stderr, "Fault Addr: %s\n", fault_addr); + } +} + +void test_stream_arena_fault_address(void) +{ + struct stream *skel; + +#if !defined(__x86_64__) && !defined(__aarch64__) + printf("%s:SKIP: arena fault reporting not supported\n", __func__); + test__skip(); + return; +#endif + + skel = stream__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stream__open_and_load")) + return; + + if (test__start_subtest("read_fault")) + test_address(skel->progs.stream_arena_read_fault, &skel->bss->fault_addr); + if (test__start_subtest("write_fault")) + test_address(skel->progs.stream_arena_write_fault, &skel->bss->fault_addr); + + stream__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 35af8044d059..4d66fad3c8bd 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -8,6 +8,7 @@ static const char * const test_cases[] = { "strcmp", + "strcasecmp", "strchr", "strchrnul", "strnchr", diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h new file mode 100644 index 000000000000..2de38776a2d4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TASK_LOCAL_DATA_H +#define __TASK_LOCAL_DATA_H + +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <stdatomic.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/types.h> + +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT +#include <pthread.h> +#endif + +#include <bpf/bpf.h> + +/* + * OPTIONS + * + * Define the option before including the header + * + * TLD_FREE_DATA_ON_THREAD_EXIT - Frees memory on thread exit 
automatically + * + * Thread-specific memory for storing TLD is allocated lazily on the first call to + * tld_get_data(). The thread that calls it must also call tld_free() on thread exit + * to prevent memory leak. Pthread will be included if the option is defined. A pthread + * key will be registered with a destructor that calls tld_free(). + * + * + * TLD_DYN_DATA_SIZE - The maximum size of memory allocated for TLDs created dynamically + * (default: 64 bytes) + * + * A TLD can be defined statically using TLD_DEFINE_KEY() or created on the fly using + * tld_create_key(). As the total size of TLDs created with tld_create_key() cannot be + * possibly known statically, a memory area of size TLD_DYN_DATA_SIZE will be allocated + * for these TLDs. This additional memory is allocated for every thread that calls + * tld_get_data() even if no tld_create_key are actually called, so be mindful of + * potential memory wastage. Use TLD_DEFINE_KEY() whenever possible as just enough memory + * will be allocated for TLDs created with it. + * + * + * TLD_NAME_LEN - The maximum length of the name of a TLD (default: 62) + * + * Setting TLD_NAME_LEN will affect the maximum number of TLDs a process can store, + * TLD_MAX_DATA_CNT. + * + * + * TLD_DATA_USE_ALIGNED_ALLOC - Always use aligned_alloc() instead of malloc() + * + * When allocating the memory for storing TLDs, we need to make sure there is a memory + * region of the X bytes within a page. This is due to the limit posed by UPTR: memory + * pinned to the kernel cannot exceed a page nor can it cross the page boundary. The + * library normally calls malloc(2*X) given X bytes of total TLDs, and only uses + * aligned_alloc(PAGE_SIZE, X) when X >= PAGE_SIZE / 2. This is to reduce memory wastage + * as not all memory allocator can use the exact amount of memory requested to fulfill + * aligned_alloc(). For example, some may round the size up to the alignment. 
Enable the + * option to always use aligned_alloc() if the implementation has low memory overhead. + */ + +#define TLD_PAGE_SIZE getpagesize() +#define TLD_PAGE_MASK (~(TLD_PAGE_SIZE - 1)) + +#define TLD_ROUND_MASK(x, y) ((__typeof__(x))((y) - 1)) +#define TLD_ROUND_UP(x, y) ((((x) - 1) | TLD_ROUND_MASK(x, y)) + 1) + +#define TLD_READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#ifndef TLD_DYN_DATA_SIZE +#define TLD_DYN_DATA_SIZE 64 +#endif + +#define TLD_MAX_DATA_CNT (TLD_PAGE_SIZE / sizeof(struct tld_metadata) - 1) + +#ifndef TLD_NAME_LEN +#define TLD_NAME_LEN 62 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + __s16 off; +} tld_key_t; + +struct tld_metadata { + char name[TLD_NAME_LEN]; + _Atomic __u16 size; +}; + +struct tld_meta_u { + _Atomic __u8 cnt; + __u16 size; + struct tld_metadata metadata[]; +}; + +struct tld_data_u { + __u64 start; /* offset of tld_data_u->data in a page */ + char data[]; +}; + +struct tld_map_value { + void *data; + struct tld_meta_u *meta; +}; + +struct tld_meta_u * _Atomic tld_meta_p __attribute__((weak)); +__thread struct tld_data_u *tld_data_p __attribute__((weak)); +__thread void *tld_data_alloc_p __attribute__((weak)); + +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT +pthread_key_t tld_pthread_key __attribute__((weak)); + +static void tld_free(void); + +static void __tld_thread_exit_handler(void *unused) +{ + tld_free(); +} +#endif + +static int __tld_init_meta_p(void) +{ + struct tld_meta_u *meta, *uninit = NULL; + int err = 0; + + meta = (struct tld_meta_u *)aligned_alloc(TLD_PAGE_SIZE, TLD_PAGE_SIZE); + if (!meta) { + err = -ENOMEM; + goto out; + } + + memset(meta, 0, TLD_PAGE_SIZE); + meta->size = TLD_DYN_DATA_SIZE; + + if (!atomic_compare_exchange_strong(&tld_meta_p, &uninit, meta)) { + free(meta); + goto out; + } + +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT + pthread_key_create(&tld_pthread_key, __tld_thread_exit_handler); +#endif +out: + return err; +} + +static int __tld_init_data_p(int map_fd) +{ + bool 
use_aligned_alloc = false; + struct tld_map_value map_val; + struct tld_data_u *data; + void *data_alloc = NULL; + int err, tid_fd = -1; + + tid_fd = syscall(SYS_pidfd_open, sys_gettid(), O_EXCL); + if (tid_fd < 0) { + err = -errno; + goto out; + } + +#ifdef TLD_DATA_USE_ALIGNED_ALLOC + use_aligned_alloc = true; +#endif + + /* + * tld_meta_p->size = TLD_DYN_DATA_SIZE + + * total size of TLDs defined via TLD_DEFINE_KEY() + */ + data_alloc = (use_aligned_alloc || tld_meta_p->size * 2 >= TLD_PAGE_SIZE) ? + aligned_alloc(TLD_PAGE_SIZE, tld_meta_p->size) : + malloc(tld_meta_p->size * 2); + if (!data_alloc) { + err = -ENOMEM; + goto out; + } + + /* + * Always pass a page-aligned address to UPTR since the size of tld_map_value::data + * is a page in BTF. If data_alloc spans across two pages, use the page that contains large + * enough memory. + */ + if (TLD_PAGE_SIZE - (~TLD_PAGE_MASK & (intptr_t)data_alloc) >= tld_meta_p->size) { + map_val.data = (void *)(TLD_PAGE_MASK & (intptr_t)data_alloc); + data = data_alloc; + data->start = (~TLD_PAGE_MASK & (intptr_t)data_alloc) + + offsetof(struct tld_data_u, data); + } else { + map_val.data = (void *)(TLD_ROUND_UP((intptr_t)data_alloc, TLD_PAGE_SIZE)); + data = (void *)(TLD_ROUND_UP((intptr_t)data_alloc, TLD_PAGE_SIZE)); + data->start = offsetof(struct tld_data_u, data); + } + map_val.meta = TLD_READ_ONCE(tld_meta_p); + + err = bpf_map_update_elem(map_fd, &tid_fd, &map_val, 0); + if (err) { + free(data_alloc); + goto out; + } + + tld_data_p = data; + tld_data_alloc_p = data_alloc; +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT + pthread_setspecific(tld_pthread_key, (void *)1); +#endif +out: + if (tid_fd >= 0) + close(tid_fd); + return err; +} + +static tld_key_t __tld_create_key(const char *name, size_t size, bool dyn_data) +{ + int err, i, sz, off = 0; + __u8 cnt; + + if (!TLD_READ_ONCE(tld_meta_p)) { + err = __tld_init_meta_p(); + if (err) + return (tld_key_t){err}; + } + + for (i = 0; i < TLD_MAX_DATA_CNT; i++) { +retry: + cnt = 
atomic_load(&tld_meta_p->cnt); + if (i < cnt) { + /* A metadata is not ready until size is updated with a non-zero value */ + while (!(sz = atomic_load(&tld_meta_p->metadata[i].size))) + sched_yield(); + + if (!strncmp(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN)) + return (tld_key_t){-EEXIST}; + + off += TLD_ROUND_UP(sz, 8); + continue; + } + + /* + * TLD_DEFINE_KEY() is given memory upto a page while at most + * TLD_DYN_DATA_SIZE is allocated for tld_create_key() + */ + if (dyn_data) { + if (off + TLD_ROUND_UP(size, 8) > tld_meta_p->size) + return (tld_key_t){-E2BIG}; + } else { + if (off + TLD_ROUND_UP(size, 8) > TLD_PAGE_SIZE - sizeof(struct tld_data_u)) + return (tld_key_t){-E2BIG}; + tld_meta_p->size += TLD_ROUND_UP(size, 8); + } + + /* + * Only one tld_create_key() can increase the current cnt by one and + * takes the latest available slot. Other threads will check again if a new + * TLD can still be added, and then compete for the new slot after the + * succeeding thread update the size. + */ + if (!atomic_compare_exchange_strong(&tld_meta_p->cnt, &cnt, cnt + 1)) + goto retry; + + strncpy(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN); + atomic_store(&tld_meta_p->metadata[i].size, size); + return (tld_key_t){(__s16)off}; + } + + return (tld_key_t){-ENOSPC}; +} + +/** + * TLD_DEFINE_KEY() - Define a TLD and a global variable key associated with the TLD. + * + * @name: The name of the TLD + * @size: The size of the TLD + * @key: The variable name of the key. Cannot exceed TLD_NAME_LEN + * + * The macro can only be used in file scope. + * + * A global variable key of opaque type, tld_key_t, will be declared and initialized before + * main() starts. Use tld_key_is_err() or tld_key_err_or_zero() later to check if the key + * creation succeeded. Pass the key to tld_get_data() to get a pointer to the TLD. + * bpf programs can also fetch the same key by name. + * + * The total size of TLDs created using TLD_DEFINE_KEY() cannot exceed a page. 
Just + * enough memory will be allocated for each thread on the first call to tld_get_data(). + */ +#define TLD_DEFINE_KEY(key, name, size) \ +tld_key_t key; \ + \ +__attribute__((constructor)) \ +void __tld_define_key_##key(void) \ +{ \ + key = __tld_create_key(name, size, false); \ +} + +/** + * tld_create_key() - Create a TLD and return a key associated with the TLD. + * + * @name: The name the TLD + * @size: The size of the TLD + * + * Return an opaque object key. Use tld_key_is_err() or tld_key_err_or_zero() to check + * if the key creation succeeded. Pass the key to tld_get_data() to get a pointer to + * locate the TLD. bpf programs can also fetch the same key by name. + * + * Use tld_create_key() only when a TLD needs to be created dynamically (e.g., @name is + * not known statically or a TLD needs to be created conditionally) + * + * An additional TLD_DYN_DATA_SIZE bytes are allocated per-thread to accommodate TLDs + * created dynamically with tld_create_key(). Since only a user page is pinned to the + * kernel, when TLDs created with TLD_DEFINE_KEY() uses more than TLD_PAGE_SIZE - + * TLD_DYN_DATA_SIZE, the buffer size will be limited to the rest of the page. + */ +__attribute__((unused)) +static tld_key_t tld_create_key(const char *name, size_t size) +{ + return __tld_create_key(name, size, true); +} + +__attribute__((unused)) +static inline bool tld_key_is_err(tld_key_t key) +{ + return key.off < 0; +} + +__attribute__((unused)) +static inline int tld_key_err_or_zero(tld_key_t key) +{ + return tld_key_is_err(key) ? key.off : 0; +} + +/** + * tld_get_data() - Get a pointer to the TLD associated with the given key of the + * calling thread. + * + * @map_fd: A file descriptor of tld_data_map, the underlying BPF task local storage map + * of task local data. + * @key: A key object created by TLD_DEFINE_KEY() or tld_create_key(). 
+ * + * Return a pointer to the TLD if the key is valid; NULL if not enough memory for TLD + * for this thread, or the key is invalid. The returned pointer is guaranteed to be 8-byte + * aligned. + * + * Threads that call tld_get_data() must call tld_free() on exit to prevent + * memory leak if TLD_FREE_DATA_ON_THREAD_EXIT is not defined. + */ +__attribute__((unused)) +static void *tld_get_data(int map_fd, tld_key_t key) +{ + if (!TLD_READ_ONCE(tld_meta_p)) + return NULL; + + /* tld_data_p is allocated on the first invocation of tld_get_data() */ + if (!tld_data_p && __tld_init_data_p(map_fd)) + return NULL; + + return tld_data_p->data + key.off; +} + +/** + * tld_free() - Free task local data memory of the calling thread + * + * For the calling thread, all pointers to TLDs acquired before will become invalid. + * + * Users must call tld_free() on thread exit to prevent memory leak. Alternatively, + * define TLD_FREE_DATA_ON_THREAD_EXIT and a thread exit handler will be registered + * to free the memory automatically. + */ +__attribute__((unused)) +static void tld_free(void) +{ + if (tld_data_alloc_p) { + free(tld_data_alloc_p); + tld_data_alloc_p = NULL; + tld_data_p = NULL; + } +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __TASK_LOCAL_DATA_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/task_work_stress.c b/tools/testing/selftests/bpf/prog_tests/task_work_stress.c new file mode 100644 index 000000000000..450d17d91a56 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_work_stress.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#include <test_progs.h> +#include <string.h> +#include <stdio.h> +#include "task_work_stress.skel.h" +#include <linux/bpf.h> +#include <linux/perf_event.h> +#include <sys/syscall.h> +#include <time.h> +#include <stdlib.h> +#include <stdatomic.h> + +struct test_data { + int prog_fd; + atomic_int exit; +}; + +void *runner(void *test_data) +{ + struct test_data *td = test_data; + int err = 0; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + while (!err && !atomic_load(&td->exit)) + err = bpf_prog_test_run_opts(td->prog_fd, &opts); + + return NULL; +} + +static int get_env_int(const char *str, int def) +{ + const char *s = getenv(str); + char *end; + int retval; + + if (!s || !*s) + return def; + errno = 0; + retval = strtol(s, &end, 10); + if (errno || *end || retval < 0) + return def; + return retval; +} + +static void task_work_run(bool enable_delete) +{ + struct task_work_stress *skel; + struct bpf_program *scheduler, *deleter; + int nthreads = 16; + int test_time_s = get_env_int("BPF_TASK_WORK_TEST_TIME", 1); + pthread_t tid[nthreads], tid_del; + bool started[nthreads], started_del = false; + struct test_data td_sched = { .exit = 0 }, td_del = { .exit = 1 }; + int i, err; + + skel = task_work_stress__open(); + if (!ASSERT_OK_PTR(skel, "task_work__open")) + return; + + scheduler = bpf_object__find_program_by_name(skel->obj, "schedule_task_work"); + bpf_program__set_autoload(scheduler, true); + + deleter = bpf_object__find_program_by_name(skel->obj, "delete_task_work"); + bpf_program__set_autoload(deleter, true); + + err = task_work_stress__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + for (i = 0; i < nthreads; ++i) + started[i] = false; + + td_sched.prog_fd = bpf_program__fd(scheduler); + for (i = 0; i < nthreads; ++i) { + if (pthread_create(&tid[i], NULL, runner, &td_sched) != 0) { + fprintf(stderr, "could not start thread"); + goto cancel; + } + started[i] = true; + } + + if (enable_delete) + atomic_store(&td_del.exit, 0); + + td_del.prog_fd 
= bpf_program__fd(deleter); + if (pthread_create(&tid_del, NULL, runner, &td_del) != 0) { + fprintf(stderr, "could not start thread"); + goto cancel; + } + started_del = true; + + /* Run stress test for some time */ + sleep(test_time_s); + +cancel: + atomic_store(&td_sched.exit, 1); + atomic_store(&td_del.exit, 1); + for (i = 0; i < nthreads; ++i) { + if (started[i]) + pthread_join(tid[i], NULL); + } + + if (started_del) + pthread_join(tid_del, NULL); + + ASSERT_GT(skel->bss->callback_scheduled, 0, "work scheduled"); + /* Some scheduling attempts should have failed due to contention */ + ASSERT_GT(skel->bss->schedule_error, 0, "schedule error"); + + if (enable_delete) { + /* If delete thread is enabled, it has cancelled some callbacks */ + ASSERT_GT(skel->bss->delete_success, 0, "delete success"); + ASSERT_LT(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); + } else { + /* Without delete thread number of scheduled callbacks is the same as fired */ + ASSERT_EQ(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); + } + +cleanup: + task_work_stress__destroy(skel); +} + +void test_task_work_stress(void) +{ + if (test__start_subtest("no_delete")) + task_work_run(false); + if (test__start_subtest("with_delete")) + task_work_run(true); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c new file mode 100644 index 000000000000..fd8762ba4b67 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "struct_ops_id_ops_mapping1.skel.h" +#include "struct_ops_id_ops_mapping2.skel.h" + +static void test_st_ops_id_ops_mapping(void) +{ + struct struct_ops_id_ops_mapping1 *skel1 = NULL; + struct struct_ops_id_ops_mapping2 *skel2 = NULL; + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int 
err, pid, prog1_fd, prog2_fd; + + skel1 = struct_ops_id_ops_mapping1__open_and_load(); + if (!ASSERT_OK_PTR(skel1, "struct_ops_id_ops_mapping1__open")) + goto out; + + skel2 = struct_ops_id_ops_mapping2__open_and_load(); + if (!ASSERT_OK_PTR(skel2, "struct_ops_id_ops_mapping2__open")) + goto out; + + err = bpf_map_get_info_by_fd(bpf_map__fd(skel1->maps.st_ops_map), + &info, &len); + if (!ASSERT_OK(err, "bpf_map_get_info_by_fd")) + goto out; + + skel1->bss->st_ops_id = info.id; + + err = bpf_map_get_info_by_fd(bpf_map__fd(skel2->maps.st_ops_map), + &info, &len); + if (!ASSERT_OK(err, "bpf_map_get_info_by_fd")) + goto out; + + skel2->bss->st_ops_id = info.id; + + err = struct_ops_id_ops_mapping1__attach(skel1); + if (!ASSERT_OK(err, "struct_ops_id_ops_mapping1__attach")) + goto out; + + err = struct_ops_id_ops_mapping2__attach(skel2); + if (!ASSERT_OK(err, "struct_ops_id_ops_mapping2__attach")) + goto out; + + /* run tracing prog that calls .test_1 and checks return */ + pid = getpid(); + skel1->bss->test_pid = pid; + skel2->bss->test_pid = pid; + sys_gettid(); + skel1->bss->test_pid = 0; + skel2->bss->test_pid = 0; + + /* run syscall_prog that calls .test_1 and checks return */ + prog1_fd = bpf_program__fd(skel1->progs.syscall_prog); + err = bpf_prog_test_run_opts(prog1_fd, NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + prog2_fd = bpf_program__fd(skel2->progs.syscall_prog); + err = bpf_prog_test_run_opts(prog2_fd, NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel1->bss->test_err, 0, "skel1->bss->test_err"); + ASSERT_EQ(skel2->bss->test_err, 0, "skel2->bss->test_err"); + +out: + struct_ops_id_ops_mapping1__destroy(skel1); + struct_ops_id_ops_mapping2__destroy(skel2); +} + +void test_struct_ops_id_ops_mapping(void) +{ + if (test__start_subtest("st_ops_id_ops_mapping")) + test_st_ops_id_ops_mapping(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c 
new file mode 100644 index 000000000000..9fd6306b455c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <pthread.h> +#include <bpf/btf.h> +#include <test_progs.h> + +#define TLD_FREE_DATA_ON_THREAD_EXIT +#define TLD_DYN_DATA_SIZE 4096 +#include "task_local_data.h" + +struct test_tld_struct { + __u64 a; + __u64 b; + __u64 c; + __u64 d; +}; + +#include "test_task_local_data.skel.h" + +TLD_DEFINE_KEY(value0_key, "value0", sizeof(int)); + +/* + * Reset task local data between subtests by clearing metadata other + * than the statically defined value0. This is safe as subtests run + * sequentially. Users of task local data library should not touch + * library internal. + */ +static void reset_tld(void) +{ + if (TLD_READ_ONCE(tld_meta_p)) { + /* Remove TLDs created by tld_create_key() */ + tld_meta_p->cnt = 1; + tld_meta_p->size = TLD_DYN_DATA_SIZE; + memset(&tld_meta_p->metadata[1], 0, + (TLD_MAX_DATA_CNT - 1) * sizeof(struct tld_metadata)); + } +} + +/* Serialize access to bpf program's global variables */ +static pthread_mutex_t global_mutex; + +static tld_key_t *tld_keys; + +#define TEST_BASIC_THREAD_NUM 32 + +void *test_task_local_data_basic_thread(void *arg) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct test_task_local_data *skel = (struct test_task_local_data *)arg; + int fd, err, tid, *value0, *value1; + struct test_tld_struct *value2; + + fd = bpf_map__fd(skel->maps.tld_data_map); + + value0 = tld_get_data(fd, value0_key); + if (!ASSERT_OK_PTR(value0, "tld_get_data")) + goto out; + + value1 = tld_get_data(fd, tld_keys[1]); + if (!ASSERT_OK_PTR(value1, "tld_get_data")) + goto out; + + value2 = tld_get_data(fd, tld_keys[2]); + if (!ASSERT_OK_PTR(value2, "tld_get_data")) + goto out; + + tid = sys_gettid(); + + *value0 = tid + 0; + *value1 = tid + 1; + value2->a = tid + 2; + value2->b = tid + 3; + value2->c = tid + 4; + value2->d = tid + 5; + + 
pthread_mutex_lock(&global_mutex); + /* Run task_main that read task local data and save to global variables */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts); + ASSERT_OK(err, "run task_main"); + ASSERT_OK(opts.retval, "task_main retval"); + + ASSERT_EQ(skel->bss->test_value0, tid + 0, "tld_get_data value0"); + ASSERT_EQ(skel->bss->test_value1, tid + 1, "tld_get_data value1"); + ASSERT_EQ(skel->bss->test_value2.a, tid + 2, "tld_get_data value2.a"); + ASSERT_EQ(skel->bss->test_value2.b, tid + 3, "tld_get_data value2.b"); + ASSERT_EQ(skel->bss->test_value2.c, tid + 4, "tld_get_data value2.c"); + ASSERT_EQ(skel->bss->test_value2.d, tid + 5, "tld_get_data value2.d"); + pthread_mutex_unlock(&global_mutex); + + /* Make sure valueX are indeed local to threads */ + ASSERT_EQ(*value0, tid + 0, "value0"); + ASSERT_EQ(*value1, tid + 1, "value1"); + ASSERT_EQ(value2->a, tid + 2, "value2.a"); + ASSERT_EQ(value2->b, tid + 3, "value2.b"); + ASSERT_EQ(value2->c, tid + 4, "value2.c"); + ASSERT_EQ(value2->d, tid + 5, "value2.d"); + + *value0 = tid + 5; + *value1 = tid + 4; + value2->a = tid + 3; + value2->b = tid + 2; + value2->c = tid + 1; + value2->d = tid + 0; + + /* Run task_main again */ + pthread_mutex_lock(&global_mutex); + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts); + ASSERT_OK(err, "run task_main"); + ASSERT_OK(opts.retval, "task_main retval"); + + ASSERT_EQ(skel->bss->test_value0, tid + 5, "tld_get_data value0"); + ASSERT_EQ(skel->bss->test_value1, tid + 4, "tld_get_data value1"); + ASSERT_EQ(skel->bss->test_value2.a, tid + 3, "tld_get_data value2.a"); + ASSERT_EQ(skel->bss->test_value2.b, tid + 2, "tld_get_data value2.b"); + ASSERT_EQ(skel->bss->test_value2.c, tid + 1, "tld_get_data value2.c"); + ASSERT_EQ(skel->bss->test_value2.d, tid + 0, "tld_get_data value2.d"); + pthread_mutex_unlock(&global_mutex); + +out: + pthread_exit(NULL); +} + +static void test_task_local_data_basic(void) +{ + struct 
test_task_local_data *skel; + pthread_t thread[TEST_BASIC_THREAD_NUM]; + char dummy_key_name[TLD_NAME_LEN]; + tld_key_t key; + int i, err; + + reset_tld(); + + ASSERT_OK(pthread_mutex_init(&global_mutex, NULL), "pthread_mutex_init"); + + skel = test_task_local_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + tld_keys = calloc(TLD_MAX_DATA_CNT, sizeof(tld_key_t)); + if (!ASSERT_OK_PTR(tld_keys, "calloc tld_keys")) + goto out; + + ASSERT_FALSE(tld_key_is_err(value0_key), "TLD_DEFINE_KEY"); + tld_keys[1] = tld_create_key("value1", sizeof(int)); + ASSERT_FALSE(tld_key_is_err(tld_keys[1]), "tld_create_key"); + tld_keys[2] = tld_create_key("value2", sizeof(struct test_tld_struct)); + ASSERT_FALSE(tld_key_is_err(tld_keys[2]), "tld_create_key"); + + /* + * Shouldn't be able to store data exceed a page. Create a TLD just big + * enough to exceed a page. TLDs already created are int value0, int + * value1, and struct test_tld_struct value2. + */ + key = tld_create_key("value_not_exist", + TLD_PAGE_SIZE - 2 * sizeof(int) - sizeof(struct test_tld_struct) + 1); + ASSERT_EQ(tld_key_err_or_zero(key), -E2BIG, "tld_create_key"); + + key = tld_create_key("value2", sizeof(struct test_tld_struct)); + ASSERT_EQ(tld_key_err_or_zero(key), -EEXIST, "tld_create_key"); + + /* Shouldn't be able to create the (TLD_MAX_DATA_CNT+1)-th TLD */ + for (i = 3; i < TLD_MAX_DATA_CNT; i++) { + snprintf(dummy_key_name, TLD_NAME_LEN, "dummy_value%d", i); + tld_keys[i] = tld_create_key(dummy_key_name, sizeof(int)); + ASSERT_FALSE(tld_key_is_err(tld_keys[i]), "tld_create_key"); + } + key = tld_create_key("value_not_exist", sizeof(struct test_tld_struct)); + ASSERT_EQ(tld_key_err_or_zero(key), -ENOSPC, "tld_create_key"); + + /* Access TLDs from multiple threads and check if they are thread-specific */ + for (i = 0; i < TEST_BASIC_THREAD_NUM; i++) { + err = pthread_create(&thread[i], NULL, test_task_local_data_basic_thread, skel); + if (!ASSERT_OK(err, "pthread_create")) 
+ goto out; + } + +out: + for (i = 0; i < TEST_BASIC_THREAD_NUM; i++) + pthread_join(thread[i], NULL); + + if (tld_keys) { + free(tld_keys); + tld_keys = NULL; + } + tld_free(); + test_task_local_data__destroy(skel); +} + +#define TEST_RACE_THREAD_NUM (TLD_MAX_DATA_CNT - 3) + +void *test_task_local_data_race_thread(void *arg) +{ + int err = 0, id = (intptr_t)arg; + char key_name[32]; + tld_key_t key; + + key = tld_create_key("value_not_exist", TLD_PAGE_SIZE + 1); + if (tld_key_err_or_zero(key) != -E2BIG) { + err = 1; + goto out; + } + + /* Only one thread will succeed in creating value1 */ + key = tld_create_key("value1", sizeof(int)); + if (!tld_key_is_err(key)) + tld_keys[1] = key; + + /* Only one thread will succeed in creating value2 */ + key = tld_create_key("value2", sizeof(struct test_tld_struct)); + if (!tld_key_is_err(key)) + tld_keys[2] = key; + + snprintf(key_name, 32, "thread_%d", id); + tld_keys[id] = tld_create_key(key_name, sizeof(int)); + if (tld_key_is_err(tld_keys[id])) + err = 2; +out: + return (void *)(intptr_t)err; +} + +static void test_task_local_data_race(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + pthread_t thread[TEST_RACE_THREAD_NUM]; + struct test_task_local_data *skel; + int fd, i, j, err, *data; + void *ret = NULL; + + skel = test_task_local_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + tld_keys = calloc(TLD_MAX_DATA_CNT, sizeof(tld_key_t)); + if (!ASSERT_OK_PTR(tld_keys, "calloc tld_keys")) + goto out; + + fd = bpf_map__fd(skel->maps.tld_data_map); + + ASSERT_FALSE(tld_key_is_err(value0_key), "TLD_DEFINE_KEY"); + tld_keys[0] = value0_key; + + for (j = 0; j < 100; j++) { + reset_tld(); + + for (i = 0; i < TEST_RACE_THREAD_NUM; i++) { + /* + * Try to make tld_create_key() race with each other. Call + * tld_create_key(), both valid and invalid, from different threads. 
+ */ + err = pthread_create(&thread[i], NULL, test_task_local_data_race_thread, + (void *)(intptr_t)(i + 3)); + if (CHECK_FAIL(err)) + break; + } + + /* Wait for all tld_create_key() to return */ + for (i = 0; i < TEST_RACE_THREAD_NUM; i++) { + pthread_join(thread[i], &ret); + if (CHECK_FAIL(ret)) + break; + } + + /* Write a unique number to each TLD */ + for (i = 0; i < TLD_MAX_DATA_CNT; i++) { + data = tld_get_data(fd, tld_keys[i]); + if (CHECK_FAIL(!data)) + break; + *data = i; + } + + /* Read TLDs and check the value to see if any address collides with another */ + for (i = 0; i < TLD_MAX_DATA_CNT; i++) { + data = tld_get_data(fd, tld_keys[i]); + if (CHECK_FAIL(*data != i)) + break; + } + + /* Run task_main to make sure no invalid TLDs are added */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts); + ASSERT_OK(err, "run task_main"); + ASSERT_OK(opts.retval, "task_main retval"); + } +out: + if (tld_keys) { + free(tld_keys); + tld_keys = NULL; + } + tld_free(); + test_task_local_data__destroy(skel); +} + +void test_task_local_data(void) +{ + if (test__start_subtest("task_local_data_basic")) + test_task_local_data_basic(); + if (test__start_subtest("task_local_data_race")) + test_task_local_data_race(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_work.c b/tools/testing/selftests/bpf/prog_tests/test_task_work.c new file mode 100644 index 000000000000..774b31a5f6ca --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_task_work.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#include <test_progs.h> +#include <string.h> +#include <stdio.h> +#include "task_work.skel.h" +#include "task_work_fail.skel.h" +#include <linux/bpf.h> +#include <linux/perf_event.h> +#include <sys/syscall.h> +#include <time.h> + +static int perf_event_open(__u32 type, __u64 config, int pid) +{ + struct perf_event_attr attr = { + .type = type, + .config = config, + .size = sizeof(struct perf_event_attr), + .sample_period = 100000, + }; + + return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0); +} + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +static int verify_map(struct bpf_map *map, const char *expected_data) +{ + int err; + struct elem value; + int processed_values = 0; + int k, sz; + + sz = bpf_map__max_entries(map); + for (k = 0; k < sz; ++k) { + err = bpf_map__lookup_elem(map, &k, sizeof(int), &value, sizeof(struct elem), 0); + if (err) + continue; + if (!ASSERT_EQ(strcmp(expected_data, value.data), 0, "map data")) { + fprintf(stderr, "expected '%s', found '%s' in %s map", expected_data, + value.data, bpf_map__name(map)); + return 2; + } + processed_values++; + } + + return processed_values == 0; +} + +static void task_work_run(const char *prog_name, const char *map_name) +{ + struct task_work *skel; + struct bpf_program *prog; + struct bpf_map *map; + struct bpf_link *link = NULL; + int err, pe_fd = -1, pid, status, pipefd[2]; + char user_string[] = "hello world"; + + if (!ASSERT_NEQ(pipe(pipefd), -1, "pipe")) + return; + + pid = fork(); + if (pid == 0) { + __u64 num = 1; + int i; + char buf; + + close(pipefd[1]); + read(pipefd[0], &buf, sizeof(buf)); + close(pipefd[0]); + + for (i = 0; i < 10000; ++i) + num *= time(0) % 7; + (void)num; + exit(0); + } + if (!ASSERT_GT(pid, 0, "fork() failed")) { + close(pipefd[0]); + close(pipefd[1]); + return; + } + + skel = task_work__open(); + if (!ASSERT_OK_PTR(skel, "task_work__open")) + return; + + bpf_object__for_each_program(prog, skel->obj) { + bpf_program__set_autoload(prog, false); 
+ } + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "prog_name")) + goto cleanup; + bpf_program__set_autoload(prog, true); + skel->bss->user_ptr = (char *)user_string; + + err = task_work__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + pe_fd = perf_event_open(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, pid); + if (pe_fd == -1 && (errno == ENOENT || errno == EOPNOTSUPP)) { + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); + test__skip(); + goto cleanup; + } + if (!ASSERT_NEQ(pe_fd, -1, "pe_fd")) { + fprintf(stderr, "perf_event_open errno: %d, pid: %d\n", errno, pid); + goto cleanup; + } + + link = bpf_program__attach_perf_event(prog, pe_fd); + if (!ASSERT_OK_PTR(link, "attach_perf_event")) + goto cleanup; + + /* perf event fd ownership is passed to bpf_link */ + pe_fd = -1; + close(pipefd[0]); + write(pipefd[1], user_string, 1); + close(pipefd[1]); + /* Wait to collect some samples */ + waitpid(pid, &status, 0); + pid = 0; + map = bpf_object__find_map_by_name(skel->obj, map_name); + if (!ASSERT_OK_PTR(map, "find map_name")) + goto cleanup; + if (!ASSERT_OK(verify_map(map, user_string), "verify map")) + goto cleanup; +cleanup: + if (pe_fd >= 0) + close(pe_fd); + bpf_link__destroy(link); + task_work__destroy(skel); + if (pid > 0) { + close(pipefd[0]); + write(pipefd[1], user_string, 1); + close(pipefd[1]); + waitpid(pid, &status, 0); + } +} + +void test_task_work(void) +{ + if (test__start_subtest("test_task_work_hash_map")) + task_work_run("oncpu_hash_map", "hmap"); + + if (test__start_subtest("test_task_work_array_map")) + task_work_run("oncpu_array_map", "arrmap"); + + if (test__start_subtest("test_task_work_lru_map")) + task_work_run("oncpu_lru_map", "lrumap"); + + RUN_TESTS(task_work_fail); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_veristat.c b/tools/testing/selftests/bpf/prog_tests/test_veristat.c index 367f47e4a936..b38c16b4247f 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/test_veristat.c +++ b/tools/testing/selftests/bpf/prog_tests/test_veristat.c @@ -75,26 +75,26 @@ static void test_set_global_vars_succeeds(void) " -vl2 > %s", fix->veristat, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); - __CHECK_STR("_w=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); - __CHECK_STR("_w=0x80000000 ", "var_s32 = -0x80000000"); - __CHECK_STR("_w=0x76543210 ", "var_u32 = 0x76543210"); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); - __CHECK_STR("_w=128 ", "var_s8 = -128"); - __CHECK_STR("_w=255 ", "var_u8 = 255"); - __CHECK_STR("_w=11 ", "var_ea = EA2"); - __CHECK_STR("_w=12 ", "var_eb = EB2"); - __CHECK_STR("_w=13 ", "var_ec = EC2"); - __CHECK_STR("_w=1 ", "var_b = 1"); - __CHECK_STR("_w=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); - __CHECK_STR("_w=0xaaaa ", "union1.var_u16 = 0xaaaa"); - __CHECK_STR("_w=171 ", "arr[3]= 171"); - __CHECK_STR("_w=172 ", "arr[EA2] =172"); - __CHECK_STR("_w=10 ", "enum_arr[EC2]=EA3"); - __CHECK_STR("_w=173 ", "matrix[31][7][11]=173"); - __CHECK_STR("_w=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); - __CHECK_STR("_w=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); + __CHECK_STR("=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); + __CHECK_STR("=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); + __CHECK_STR("=0x80000000 ", "var_s32 = -0x80000000"); + __CHECK_STR("=0x76543210 ", "var_u32 = 0x76543210"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=128 ", "var_s8 = -128"); + __CHECK_STR("=255 ", "var_u8 = 255"); + __CHECK_STR("=11 ", "var_ea = EA2"); + __CHECK_STR("=12 ", "var_eb = EB2"); + __CHECK_STR("=13 ", "var_ec = EC2"); + __CHECK_STR("=1 ", "var_b = 1"); + __CHECK_STR("=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); + __CHECK_STR("=0xaaaa ", 
"union1.var_u16 = 0xaaaa"); + __CHECK_STR("=171 ", "arr[3]= 171"); + __CHECK_STR("=172 ", "arr[EA2] =172"); + __CHECK_STR("=10 ", "enum_arr[EC2]=EA3"); + __CHECK_STR("=173 ", "matrix[31][7][11]=173"); + __CHECK_STR("=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); + __CHECK_STR("=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); out: teardown_fixture(fix); @@ -117,8 +117,8 @@ static void test_set_global_vars_from_file_succeeds(void) SYS(out, "%s set_global_vars.bpf.o -G \"@%s\" -vl2 > %s", fix->veristat, input_file, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); out: close(fd); diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index d66687f1ee6a..34f9ccce2602 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -3,6 +3,7 @@ #include <test_progs.h> #include "timer.skel.h" #include "timer_failure.skel.h" +#include "timer_interrupt.skel.h" #define NUM_THR 8 @@ -86,6 +87,10 @@ void serial_test_timer(void) int err; timer_skel = timer__open_and_load(); + if (!timer_skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) return; @@ -95,3 +100,36 @@ void serial_test_timer(void) RUN_TESTS(timer_failure); } + +void test_timer_interrupt(void) +{ + struct timer_interrupt *skel = NULL; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + skel = timer_interrupt__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } + if (!ASSERT_OK_PTR(skel, "timer_interrupt__open_and_load")) + return; + + err = timer_interrupt__attach(skel); + if (!ASSERT_OK(err, "timer_interrupt__attach")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.test_timer_interrupt); + err = 
bpf_prog_test_run_opts(prog_fd, &opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto out; + + usleep(50); + + ASSERT_EQ(skel->bss->in_interrupt, 0, "in_interrupt"); + if (skel->bss->preempt_count) + ASSERT_NEQ(skel->bss->in_interrupt_cb, 0, "in_interrupt_cb"); + +out: + timer_interrupt__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/timer_crash.c b/tools/testing/selftests/bpf/prog_tests/timer_crash.c index f74b82305da8..b841597c8a3a 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_crash.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_crash.c @@ -12,6 +12,10 @@ static void test_timer_crash_mode(int mode) struct timer_crash *skel; skel = timer_crash__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_crash__open_and_load")) return; skel->bss->pid = getpid(); diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c index 1a2f99596916..eb303fa1e09a 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c @@ -59,6 +59,10 @@ void test_timer_lockup(void) } skel = timer_lockup__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c index 9ff7843909e7..c930c7d7105b 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_mim.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c @@ -65,6 +65,10 @@ void serial_test_timer_mim(void) goto cleanup; timer_skel = timer_mim__open_and_load(); + if (!timer_skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c 
b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c index 19e68d4b3532..6f8c0bfb0415 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c @@ -112,10 +112,39 @@ destroy_skel: tracing_struct_many_args__destroy(skel); } +static void test_union_args(void) +{ + struct tracing_struct *skel; + int err; + + skel = tracing_struct__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_struct__open_and_load")) + return; + + err = tracing_struct__attach(skel); + if (!ASSERT_OK(err, "tracing_struct__attach")) + goto out; + + ASSERT_OK(trigger_module_test_read(256), "trigger_read"); + + ASSERT_EQ(skel->bss->ut1_a_a, 1, "ut1:a.arg.a"); + ASSERT_EQ(skel->bss->ut1_b, 4, "ut1:b"); + ASSERT_EQ(skel->bss->ut1_c, 5, "ut1:c"); + + ASSERT_EQ(skel->bss->ut2_a, 6, "ut2:a"); + ASSERT_EQ(skel->bss->ut2_b_a, 2, "ut2:b.arg.a"); + ASSERT_EQ(skel->bss->ut2_b_b, 3, "ut2:b.arg.b"); + +out: + tracing_struct__destroy(skel); +} + void test_tracing_struct(void) { if (test__start_subtest("struct_args")) test_struct_args(); if (test__start_subtest("struct_many_args")) test_struct_many_args(); + if (test__start_subtest("union_args")) + test_union_args(); } diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe.c b/tools/testing/selftests/bpf/prog_tests/uprobe.c index cf3e0e7a64fa..86404476c1da 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe.c @@ -2,6 +2,7 @@ /* Copyright (c) 2023 Hengqi Chen */ #include <test_progs.h> +#include <asm/ptrace.h> #include "test_uprobe.skel.h" static FILE *urand_spawn(int *pid) @@ -33,7 +34,7 @@ static int urand_trigger(FILE **urand_pipe) return exit_code; } -void test_uprobe(void) +static void test_uprobe_attach(void) { LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); struct test_uprobe *skel; @@ -93,3 +94,156 @@ cleanup: pclose(urand_pipe); test_uprobe__destroy(skel); } + +#ifdef __x86_64__ +__naked __maybe_unused unsigned long 
uprobe_regs_change_trigger(void) +{ + asm volatile ( + "ret\n" + ); +} + +static __naked void uprobe_regs_change(struct pt_regs *before, struct pt_regs *after) +{ + asm volatile ( + "movq %r11, 48(%rdi)\n" + "movq %r10, 56(%rdi)\n" + "movq %r9, 64(%rdi)\n" + "movq %r8, 72(%rdi)\n" + "movq %rax, 80(%rdi)\n" + "movq %rcx, 88(%rdi)\n" + "movq %rdx, 96(%rdi)\n" + "movq %rsi, 104(%rdi)\n" + "movq %rdi, 112(%rdi)\n" + + /* save 2nd argument */ + "pushq %rsi\n" + "call uprobe_regs_change_trigger\n" + + /* save return value and load 2nd argument pointer to rax */ + "pushq %rax\n" + "movq 8(%rsp), %rax\n" + + "movq %r11, 48(%rax)\n" + "movq %r10, 56(%rax)\n" + "movq %r9, 64(%rax)\n" + "movq %r8, 72(%rax)\n" + "movq %rcx, 88(%rax)\n" + "movq %rdx, 96(%rax)\n" + "movq %rsi, 104(%rax)\n" + "movq %rdi, 112(%rax)\n" + + /* restore return value and 2nd argument */ + "pop %rax\n" + "pop %rsi\n" + + "movq %rax, 80(%rsi)\n" + "ret\n" + ); +} + +static void regs_common(void) +{ + struct pt_regs before = {}, after = {}, expected = { + .rax = 0xc0ffe, + .rcx = 0xbad, + .rdx = 0xdead, + .r8 = 0x8, + .r9 = 0x9, + .r10 = 0x10, + .r11 = 0x11, + .rdi = 0x12, + .rsi = 0x13, + }; + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + struct test_uprobe *skel; + + skel = test_uprobe__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->my_pid = getpid(); + skel->bss->regs = expected; + + uprobe_opts.func_name = "uprobe_regs_change_trigger"; + skel->links.test_regs_change = bpf_program__attach_uprobe_opts(skel->progs.test_regs_change, + -1, + "/proc/self/exe", + 0 /* offset */, + &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.test_regs_change, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + uprobe_regs_change(&before, &after); + + ASSERT_EQ(after.rax, expected.rax, "ax"); + ASSERT_EQ(after.rcx, expected.rcx, "cx"); + ASSERT_EQ(after.rdx, expected.rdx, "dx"); + ASSERT_EQ(after.r8, expected.r8, "r8"); + ASSERT_EQ(after.r9, expected.r9, "r9"); + ASSERT_EQ(after.r10, 
expected.r10, "r10"); + ASSERT_EQ(after.r11, expected.r11, "r11"); + ASSERT_EQ(after.rdi, expected.rdi, "rdi"); + ASSERT_EQ(after.rsi, expected.rsi, "rsi"); + +cleanup: + test_uprobe__destroy(skel); +} + +static noinline unsigned long uprobe_regs_change_ip_1(void) +{ + return 0xc0ffee; +} + +static noinline unsigned long uprobe_regs_change_ip_2(void) +{ + return 0xdeadbeef; +} + +static void regs_ip(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + struct test_uprobe *skel; + unsigned long ret; + + skel = test_uprobe__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->my_pid = getpid(); + skel->bss->ip = (unsigned long) uprobe_regs_change_ip_2; + + uprobe_opts.func_name = "uprobe_regs_change_ip_1"; + skel->links.test_regs_change_ip = bpf_program__attach_uprobe_opts( + skel->progs.test_regs_change_ip, + -1, + "/proc/self/exe", + 0 /* offset */, + &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.test_regs_change_ip, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + ret = uprobe_regs_change_ip_1(); + ASSERT_EQ(ret, 0xdeadbeef, "ret"); + +cleanup: + test_uprobe__destroy(skel); +} + +static void test_uprobe_regs_change(void) +{ + if (test__start_subtest("regs_change_common")) + regs_common(); + if (test__start_subtest("regs_change_ip")) + regs_ip(); +} +#else +static void test_uprobe_regs_change(void) { } +#endif + +void test_uprobe(void) +{ + if (test__start_subtest("attach")) + test_uprobe_attach(); + test_uprobe_regs_change(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index b17dc39a23db..6d75ede16e7c 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -8,22 +8,31 @@ #include <asm/ptrace.h> #include <linux/compiler.h> #include <linux/stringify.h> +#include <linux/kernel.h> #include <sys/wait.h> #include <sys/syscall.h> #include <sys/prctl.h> #include 
<asm/prctl.h> #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "bpf/libbpf_internal.h" -__naked unsigned long uretprobe_regs_trigger(void) +#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 +#include "usdt.h" + +#pragma GCC diagnostic ignored "-Wattributes" + +__attribute__((aligned(16))) +__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void) { asm volatile ( + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */ "movq $0xdeadbeef, %rax\n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -44,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) "movq $0, 120(%rdi)\n" /* orig_rax */ "movq $0, 128(%rdi)\n" /* rip */ "movq $0, 136(%rdi)\n" /* cs */ + "pushq %rax\n" "pushf\n" "pop %rax\n" "movq %rax, 144(%rdi)\n" /* eflags */ + "pop %rax\n" "movq %rsp, 152(%rdi)\n" /* rsp */ "movq $0, 160(%rdi)\n" /* ss */ /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -92,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned long offset; unsigned int i, cnt; - int err; + + offset = get_uprobe_offset(&uprobe_regs_trigger); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = 
uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -119,7 +142,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. */ switch (offset) { @@ -133,7 +156,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call (with rax exception, check below). */ switch (offset) { /* @@ -146,6 +169,15 @@ static void test_uretprobe_regs_equal(void) case offsetof(struct pt_regs, rsp): case offsetof(struct pt_regs, ss): break; + /* + * uprobe does not see return value in rax, it needs to see the + * original (before) rax value + */ + case offsetof(struct pt_regs, rax): + if (!retprobe) { + ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check"); + break; + } default: if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) fprintf(stdout, "failed register offset %u\n", offset); @@ -175,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? 
(int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -183,13 +215,16 @@ static void test_uretprobe_regs_change(void) unsigned long cnt = sizeof(before)/sizeof(*pb); unsigned int i, err, offset; - offset = get_uprobe_offset(uretprobe_regs_trigger); + offset = get_uprobe_offset(uprobe_regs_trigger); err = write_bpf_testmod_uprobe(offset); if (!ASSERT_OK(err, "register_uprobe")) return; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); if (!ASSERT_OK(err, "unregister_uprobe")) @@ -252,6 +287,7 @@ static void test_uretprobe_syscall_call(void) ); struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c = 0; + struct bpf_link *link; if (!ASSERT_OK(pipe(go), "pipe")) return; @@ -277,11 +313,14 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->bss->pid = pid; + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) goto cleanup; + skel->links.test_uretprobe_multi = link; /* kick the child */ write(go[1], &c, 1); @@ -301,6 +340,256 @@ cleanup: close(go[0]); } +#define TRAMP "[uprobes-trampoline]" + +__attribute__((aligned(16))) +__nocf_check __weak __naked void uprobe_test(void) +{ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} + +__attribute__((aligned(16))) +__nocf_check __weak void usdt_test(void) +{ + USDT(optimized_uprobe, usdt); +} + +static int find_uprobes_trampoline(void *tramp_addr) +{ + void 
*start, *end; + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. */ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; +} + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +typedef void (__attribute__((nocf_check)) *trigger_t)(void); + +static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, + void *addr, int executed) +{ + struct __arch_relative_insn { + __u8 op; + __s32 raddr; + } __packed *call; + void *tramp = NULL; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, executed, "executed"); + + /* .. and check the trampoline is as expected. 
*/ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + + return tramp; +} + +static void check_detach(void *addr, void *tramp) +{ + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + trigger_t trigger, void *addr, int executed) +{ + void *tramp; + + tramp = check_attach(skel, trigger, addr, executed); + bpf_link__destroy(link); + check_detach(addr, tramp); +} + +static void test_uprobe_legacy(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct bpf_link *link; + unsigned long offset; + + offset = 
get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + /* uprobe.multi */ + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_session(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .session = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 4); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + 
+ skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + /* * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c. * @@ -343,43 +632,172 @@ static void test_uretprobe_shadow_stack(void) return; } - /* Run all of the uretprobe tests. */ - test_uretprobe_regs_equal(); - test_uretprobe_regs_change(); + /* Run all the tests with shadow stack in place. */ + + test_uprobe_regs_equal(false); + test_uprobe_regs_equal(true); test_uretprobe_syscall_call(); + test_uprobe_legacy(); + test_uprobe_multi(); + test_uprobe_session(); + test_uprobe_usdt(); + + test_regs_change(); + ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) + +static volatile bool race_stop; + +static USDT_DEFINE_SEMA(race); + +static void *worker_trigger(void *arg) { - test__skip(); + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; } -static void test_uretprobe_regs_change(void) +static void *worker_attach(void *arg) { - test__skip(); + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + const char *sema[2] = { + __stringify(USDT_SEMA(race)), + NULL, + }; + unsigned long *ref; + int err; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return NULL; + + err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + return NULL; + + 
opts.ref_ctr_offset = *ref; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + skel->bss->pid = getpid(); + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + free(ref); + return NULL; } -static void test_uretprobe_syscall_call(void) +static useconds_t race_msec(void) { - test__skip(); + char *env; + + env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC"); + if (env) + return atoi(env); + + /* default duration is 500ms */ + return 500; } -static void test_uretprobe_shadow_stack(void) +static void test_uprobe_race(void) { - test__skip(); + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + nr_threads = max(2, nr_threads); + + threads = alloca(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? 
worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + usleep(race_msec() * 1000); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); } + +#ifndef __NR_uprobe +#define __NR_uprobe 336 #endif -void test_uprobe_syscall(void) +static void test_uprobe_error(void) +{ + long err = syscall(__NR_uprobe); + + ASSERT_EQ(err, -1, "error"); + ASSERT_EQ(errno, ENXIO, "errno"); +} + +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) - test_uretprobe_regs_equal(); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); + test_uprobe_regs_equal(true); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_session")) + test_uprobe_session(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); + if (test__start_subtest("uprobe_error")) + test_uprobe_error(); + if (test__start_subtest("uprobe_regs_equal")) + test_uprobe_regs_equal(false); + if (test__start_subtest("regs_change")) + test_regs_change(); +} +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); } diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 9057e983cc54..4f7f45e69315 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,79 @@ static void __always_inline trigger_func(int x) { } } -static void 
subtest_basic_usdt(void) +#if defined(__x86_64__) || defined(__i386__) +/* + * SIB (Scale-Index-Base) addressing format: "size@(base_reg, index_reg, scale)" + * - 'size' is the size in bytes of the array element, and its sign indicates + * whether the type is signed (negative) or unsigned (positive). + * - 'base_reg' is the register holding the base address, normally rdx or edx + * - 'index_reg' is the register holding the index, normally rax or eax + * - 'scale' is the scaling factor (typically 1, 2, 4, or 8), which matches the + * size of the element type. + * + * For example, for an array of 'short' (signed 2-byte elements), the SIB spec would be: + * - size: -2 (negative because 'short' is signed) + * - scale: 2 (since sizeof(short) == 2) + * + * The resulting SIB format: "-2@(%%rdx,%%rax,2)" for x86_64, "-2@(%%edx,%%eax,2)" for i386 + */ +static volatile short array[] = {-1, -2, -3, -4}; + +#if defined(__x86_64__) +#define USDT_SIB_ARG_SPEC -2@(%%rdx,%%rax,2) +#else +#define USDT_SIB_ARG_SPEC -2@(%%edx,%%eax,2) +#endif + +unsigned short test_usdt_sib_semaphore SEC(".probes"); + +static void trigger_sib_spec(void) +{ + /* + * Force SIB addressing with inline assembly. + * + * You must compile with -std=gnu99 or -std=c99 to use the + * STAP_PROBE_ASM macro. + * + * The STAP_PROBE_ASM macro generates a quoted string that gets + * inserted between the surrounding assembly instructions. In this + * case, USDT_SIB_ARG_SPEC is embedded directly into the instruction + * stream, creating a probe point between the asm statement boundaries. + * It works fine with gcc/clang. + * + * Register constraints: + * - "d"(array): Binds the 'array' variable to %rdx or %edx register + * - "a"(0): Binds the constant 0 to %rax or %eax register + * These ensure that when USDT_SIB_ARG_SPEC references %%rdx(%edx) and + * %%rax(%eax), they contain the expected values for SIB addressing. 
+ * + * The "memory" clobber prevents the compiler from reordering memory + * accesses around the probe point, ensuring that the probe behavior + * is predictable and consistent. + */ + asm volatile( + STAP_PROBE_ASM(test, usdt_sib, USDT_SIB_ARG_SPEC) + : + : "d"(array), "a"(0) + : "memory" + ); +} +#endif + +static void subtest_basic_usdt(bool optimized) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; - int err, i; + int err, i, called; + const __u64 expected_cookie = 0xcafedeadbeeffeed; + +#define TRIGGER(x) ({ \ + trigger_func(x); \ + if (optimized) \ + trigger_func(x); \ + optimized ? 2 : 1; \ + }) skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -59,20 +126,29 @@ static void subtest_basic_usdt(void) goto cleanup; /* usdt0 won't be auto-attached */ - opts.usdt_cookie = 0xcafedeadbeeffeed; + opts.usdt_cookie = expected_cookie; skel->links.usdt0 = bpf_program__attach_usdt(skel->progs.usdt0, 0 /*self*/, "/proc/self/exe", "test", "usdt0", &opts); if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; - trigger_func(1); +#if defined(__x86_64__) || defined(__i386__) + opts.usdt_cookie = expected_cookie; + skel->links.usdt_sib = bpf_program__attach_usdt(skel->progs.usdt_sib, + 0 /*self*/, "/proc/self/exe", + "test", "usdt_sib", &opts); + if (!ASSERT_OK_PTR(skel->links.usdt_sib, "usdt_sib_link")) + goto cleanup; +#endif + + called = TRIGGER(1); - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); - ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); + ASSERT_EQ(bss->usdt0_cookie, expected_cookie, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); ASSERT_EQ(bss->usdt0_arg_ret, -ENOENT, 
"usdt0_arg_ret"); ASSERT_EQ(bss->usdt0_arg_size, -ENOENT, "usdt0_arg_size"); @@ -119,11 +195,11 @@ static void subtest_basic_usdt(void) * bpf_program__attach_usdt() handles this properly and attaches to * all possible places of USDT invocation. */ - trigger_func(2); + called += TRIGGER(2); - ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); /* only check values that depend on trigger_func()'s input value */ ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); @@ -142,9 +218,9 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) goto cleanup; - trigger_func(3); + called += TRIGGER(3); - ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); /* this time usdt3 has custom cookie */ ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); @@ -156,8 +232,19 @@ static void subtest_basic_usdt(void) ASSERT_EQ(bss->usdt3_args[1], 42, "usdt3_arg2"); ASSERT_EQ(bss->usdt3_args[2], (uintptr_t)&bla, "usdt3_arg3"); +#if defined(__x86_64__) || defined(__i386__) + trigger_sib_spec(); + ASSERT_EQ(bss->usdt_sib_called, 1, "usdt_sib_called"); + ASSERT_EQ(bss->usdt_sib_cookie, expected_cookie, "usdt_sib_cookie"); + ASSERT_EQ(bss->usdt_sib_arg_cnt, 1, "usdt_sib_arg_cnt"); + ASSERT_EQ(bss->usdt_sib_arg, array[0], "usdt_sib_arg"); + ASSERT_EQ(bss->usdt_sib_arg_ret, 0, "usdt_sib_arg_ret"); + ASSERT_EQ(bss->usdt_sib_arg_size, sizeof(array[0]), "usdt_sib_arg_size"); +#endif + cleanup: test_usdt__destroy(skel); +#undef TRIGGER } unsigned short test_usdt_100_semaphore SEC(".probes"); @@ -425,7 +512,11 @@ cleanup: void test_usdt(void) { if (test__start_subtest("basic")) - 
subtest_basic_usdt(); + subtest_basic_usdt(false); +#ifdef __x86_64__ + if (test__start_subtest("basic_optimized")) + subtest_basic_usdt(true); +#endif if (test__start_subtest("multispec")) subtest_multispec_usdt(); if (test__start_subtest("urand_auto_attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 77ec95d4ffaa..28e81161e6fc 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -46,6 +46,7 @@ #include "verifier_ldsx.skel.h" #include "verifier_leak_ptr.skel.h" #include "verifier_linked_scalars.skel.h" +#include "verifier_live_stack.skel.h" #include "verifier_load_acquire.skel.h" #include "verifier_loops1.skel.h" #include "verifier_lwt.skel.h" @@ -59,6 +60,7 @@ #include "verifier_meta_access.skel.h" #include "verifier_movsx.skel.h" #include "verifier_mtu.skel.h" +#include "verifier_mul.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" #include "verifier_bpf_fastcall.skel.h" @@ -183,6 +185,7 @@ void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } void test_verifier_ldsx(void) { RUN(verifier_ldsx); } void test_verifier_leak_ptr(void) { RUN(verifier_leak_ptr); } void test_verifier_linked_scalars(void) { RUN(verifier_linked_scalars); } +void test_verifier_live_stack(void) { RUN(verifier_live_stack); } void test_verifier_loops1(void) { RUN(verifier_loops1); } void test_verifier_lwt(void) { RUN(verifier_lwt); } void test_verifier_map_in_map(void) { RUN(verifier_map_in_map); } @@ -194,6 +197,7 @@ void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); } void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); } void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } +void test_verifier_mul(void) { RUN(verifier_mul); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void 
test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } void test_verifier_bpf_fastcall(void) { RUN(verifier_bpf_fastcall); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index b9d9f0a502ce..178292d1251a 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -9,6 +9,7 @@ #define TX_NETNS "xdp_context_tx" #define RX_NETNS "xdp_context_rx" #define TAP_NAME "tap0" +#define DUMMY_NAME "dum0" #define TAP_NETNS "xdp_context_tuntap" #define TEST_PAYLOAD_LEN 32 @@ -96,9 +97,7 @@ void test_xdp_context_test_run(void) /* Meta data must be 255 bytes or smaller */ test_xdp_context_error(prog_fd, opts, 0, 256, sizeof(data), 0, 0, 0); - /* Total size of data must match data_end - data_meta */ - test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), - sizeof(data) - 1, 0, 0, 0); + /* Total size of data must be data_end - data_meta or larger */ test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data) + 1, 0, 0, 0); @@ -156,15 +155,30 @@ err: return -1; } -static void assert_test_result(struct test_xdp_meta *skel) +static int write_test_packet(int tap_fd) +{ + __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; + int n; + + /* The ethernet header doesn't need to be valid for this test */ + memset(packet, 0, sizeof(struct ethhdr)); + memcpy(packet + sizeof(struct ethhdr), test_payload, TEST_PAYLOAD_LEN); + + n = write(tap_fd, packet, sizeof(packet)); + if (!ASSERT_EQ(n, sizeof(packet), "write packet")) + return -1; + + return 0; +} + +static void assert_test_result(const struct bpf_map *result_map) { int err; __u32 map_key = 0; __u8 map_value[TEST_PAYLOAD_LEN]; - err = bpf_map__lookup_elem(skel->maps.test_result, &map_key, - sizeof(map_key), &map_value, - TEST_PAYLOAD_LEN, BPF_ANY); + err = bpf_map__lookup_elem(result_map, &map_key, sizeof(map_key), + &map_value, 
TEST_PAYLOAD_LEN, BPF_ANY); if (!ASSERT_OK(err, "lookup test_result")) return; @@ -172,6 +186,18 @@ static void assert_test_result(struct test_xdp_meta *skel) "test_result map contains test payload"); } +static bool clear_test_result(struct bpf_map *result_map) +{ + const __u8 v[sizeof(test_payload)] = {}; + const __u32 k = 0; + int err; + + err = bpf_map__update_elem(result_map, &k, sizeof(k), v, sizeof(v), BPF_ANY); + ASSERT_OK(err, "update test_result"); + + return err == 0; +} + void test_xdp_context_veth(void) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); @@ -248,7 +274,7 @@ void test_xdp_context_veth(void) if (!ASSERT_OK(ret, "send_test_packet")) goto close; - assert_test_result(skel); + assert_test_result(skel->maps.test_result); close: close_netns(nstoken); @@ -257,17 +283,21 @@ close: netns_free(tx_ns); } -void test_xdp_context_tuntap(void) +static void test_tuntap(struct bpf_program *xdp_prog, + struct bpf_program *tc_prio_1_prog, + struct bpf_program *tc_prio_2_prog, + struct bpf_map *result_map) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); struct netns_obj *ns = NULL; - struct test_xdp_meta *skel = NULL; - __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; int tap_fd = -1; int tap_ifindex; int ret; + if (!clear_test_result(result_map)) + return; + ns = netns_new(TAP_NETNS, true); if (!ASSERT_OK_PTR(ns, "create and open ns")) return; @@ -278,10 +308,6 @@ void test_xdp_context_tuntap(void) SYS(close, "ip link set dev " TAP_NAME " up"); - skel = test_xdp_meta__open_and_load(); - if (!ASSERT_OK_PTR(skel, "open and load skeleton")) - goto close; - tap_ifindex = if_nametoindex(TAP_NAME); if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex")) goto close; @@ -291,33 +317,175 @@ void test_xdp_context_tuntap(void) if (!ASSERT_OK(ret, "bpf_tc_hook_create")) goto close; - tc_opts.prog_fd = bpf_program__fd(skel->progs.ing_cls); + tc_opts.prog_fd = 
bpf_program__fd(tc_prio_1_prog); ret = bpf_tc_attach(&tc_hook, &tc_opts); if (!ASSERT_OK(ret, "bpf_tc_attach")) goto close; - ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(skel->progs.ing_xdp), + if (tc_prio_2_prog) { + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 2, + .prog_fd = bpf_program__fd(tc_prio_2_prog)); + + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + } + + ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(xdp_prog), 0, NULL); if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) goto close; - /* The ethernet header is not relevant for this test and doesn't need to - * be meaningful. - */ - struct ethhdr eth = { 0 }; + ret = write_test_packet(tap_fd); + if (!ASSERT_OK(ret, "write_test_packet")) + goto close; - memcpy(packet, &eth, sizeof(eth)); - memcpy(packet + sizeof(eth), test_payload, TEST_PAYLOAD_LEN); + assert_test_result(result_map); + +close: + if (tap_fd >= 0) + close(tap_fd); + netns_free(ns); +} + +/* Write a packet to a tap dev and copy it to ingress of a dummy dev */ +static void test_tuntap_mirred(struct bpf_program *xdp_prog, + struct bpf_program *tc_prog, + bool *test_pass) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); + struct netns_obj *ns = NULL; + int dummy_ifindex; + int tap_fd = -1; + int tap_ifindex; + int ret; + + *test_pass = false; + + ns = netns_new(TAP_NETNS, true); + if (!ASSERT_OK_PTR(ns, "netns_new")) + return; - ret = write(tap_fd, packet, sizeof(packet)); - if (!ASSERT_EQ(ret, sizeof(packet), "write packet")) + /* Setup dummy interface */ + SYS(close, "ip link add name " DUMMY_NAME " type dummy"); + SYS(close, "ip link set dev " DUMMY_NAME " up"); + + dummy_ifindex = if_nametoindex(DUMMY_NAME); + if (!ASSERT_GE(dummy_ifindex, 0, "if_nametoindex")) goto close; - assert_test_result(skel); + tc_hook.ifindex = dummy_ifindex; + ret = bpf_tc_hook_create(&tc_hook); + if (!ASSERT_OK(ret, 
"bpf_tc_hook_create")) + goto close; + + tc_opts.prog_fd = bpf_program__fd(tc_prog); + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + + /* Setup TAP interface */ + tap_fd = open_tuntap(TAP_NAME, true); + if (!ASSERT_GE(tap_fd, 0, "open_tuntap")) + goto close; + + SYS(close, "ip link set dev " TAP_NAME " up"); + + tap_ifindex = if_nametoindex(TAP_NAME); + if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex")) + goto close; + + ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(xdp_prog), 0, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto close; + + /* Copy all packets received from TAP to dummy ingress */ + SYS(close, "tc qdisc add dev " TAP_NAME " clsact"); + SYS(close, "tc filter add dev " TAP_NAME " ingress " + "protocol all matchall " + "action mirred ingress mirror dev " DUMMY_NAME); + + /* Receive a packet on TAP */ + ret = write_test_packet(tap_fd); + if (!ASSERT_OK(ret, "write_test_packet")) + goto close; + + ASSERT_TRUE(*test_pass, "test_pass"); close: if (tap_fd >= 0) close(tap_fd); - test_xdp_meta__destroy(skel); netns_free(ns); } + +void test_xdp_context_tuntap(void) +{ + struct test_xdp_meta *skel = NULL; + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + return; + + if (test__start_subtest("data_meta")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls, + NULL, /* tc prio 2 */ + skel->maps.test_result); + if (test__start_subtest("dynptr_read")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_read, + NULL, /* tc prio 2 */ + skel->maps.test_result); + if (test__start_subtest("dynptr_slice")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_slice, + NULL, /* tc prio 2 */ + skel->maps.test_result); + if (test__start_subtest("dynptr_write")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_write, + skel->progs.ing_cls_dynptr_read, + skel->maps.test_result); + if 
(test__start_subtest("dynptr_slice_rdwr")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_slice_rdwr, + skel->progs.ing_cls_dynptr_slice, + skel->maps.test_result); + if (test__start_subtest("dynptr_offset")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_offset_wr, + skel->progs.ing_cls_dynptr_offset_rd, + skel->maps.test_result); + if (test__start_subtest("dynptr_offset_oob")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_offset_oob, + skel->progs.ing_cls, + skel->maps.test_result); + if (test__start_subtest("clone_data_meta_empty_on_data_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_data_meta_empty_on_data_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_data_meta_empty_on_meta_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_data_meta_empty_on_meta_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_empty_on_data_slice_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_empty_on_data_slice_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_empty_on_meta_slice_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_empty_on_meta_slice_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_rdonly_before_data_dynptr_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_rdonly_before_data_dynptr_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_rdonly_before_meta_dynptr_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_rdonly_before_meta_dynptr_write, + &skel->bss->test_pass); + + test_xdp_meta__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c index 461ab18705d5..a8ab05216c38 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c 
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -7,6 +7,7 @@ #include <test_progs.h> #include "test_xdp_devmap_helpers.skel.h" +#include "test_xdp_devmap_tailcall.skel.h" #include "test_xdp_with_devmap_frags_helpers.skel.h" #include "test_xdp_with_devmap_helpers.skel.h" @@ -107,6 +108,29 @@ static void test_neg_xdp_devmap_helpers(void) } } +static void test_xdp_devmap_tailcall(enum bpf_attach_type prog_dev, + enum bpf_attach_type prog_tail, + bool expect_reject) +{ + struct test_xdp_devmap_tailcall *skel; + int err; + + skel = test_xdp_devmap_tailcall__open(); + if (!ASSERT_OK_PTR(skel, "test_xdp_devmap_tailcall__open")) + return; + + bpf_program__set_expected_attach_type(skel->progs.xdp_devmap, prog_dev); + bpf_program__set_expected_attach_type(skel->progs.xdp_entry, prog_tail); + + err = test_xdp_devmap_tailcall__load(skel); + if (expect_reject) + ASSERT_ERR(err, "test_xdp_devmap_tailcall__load"); + else + ASSERT_OK(err, "test_xdp_devmap_tailcall__load"); + + test_xdp_devmap_tailcall__destroy(skel); +} + static void test_xdp_with_devmap_frags_helpers(void) { struct test_xdp_with_devmap_frags_helpers *skel; @@ -238,8 +262,13 @@ void serial_test_xdp_devmap_attach(void) if (test__start_subtest("DEVMAP with frags programs in entries")) test_xdp_with_devmap_frags_helpers(); - if (test__start_subtest("Verifier check of DEVMAP programs")) + if (test__start_subtest("Verifier check of DEVMAP programs")) { test_neg_xdp_devmap_helpers(); + test_xdp_devmap_tailcall(BPF_XDP_DEVMAP, BPF_XDP_DEVMAP, false); + test_xdp_devmap_tailcall(0, 0, true); + test_xdp_devmap_tailcall(BPF_XDP_DEVMAP, 0, true); + test_xdp_devmap_tailcall(0, BPF_XDP_DEVMAP, true); + } if (test__start_subtest("DEVMAP with programs in entries on veth")) test_xdp_with_devmap_helpers_veth(); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c new file mode 100644 index 000000000000..efa350d04ec5 --- /dev/null +++ 
b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include <network_helpers.h> +#include "test_xdp_pull_data.skel.h" + +#define PULL_MAX (1 << 31) +#define PULL_PLUS_ONE (1 << 30) + +#define XDP_PACKET_HEADROOM 256 + +/* Find headroom and tailroom occupied by struct xdp_frame and struct + * skb_shared_info so that we can calculate the maximum pull lengths for + * test cases. They might not be the real size of the structures due to + * cache alignment. + */ +static int find_xdp_sizes(struct test_xdp_pull_data *skel, int frame_sz) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct xdp_md ctx = {}; + int prog_fd, err; + __u8 *buf; + + buf = calloc(frame_sz, sizeof(__u8)); + if (!ASSERT_OK_PTR(buf, "calloc buf")) + return -ENOMEM; + + topts.data_in = buf; + topts.data_out = buf; + topts.data_size_in = frame_sz; + topts.data_size_out = frame_sz; + /* Pass a data_end larger than the linear space available to make sure + * bpf_prog_test_run_xdp() will fill the linear data area so that + * xdp_find_sizes can infer the size of struct skb_shared_info + */ + ctx.data_end = frame_sz; + topts.ctx_in = &ctx; + topts.ctx_out = &ctx; + topts.ctx_size_in = sizeof(ctx); + topts.ctx_size_out = sizeof(ctx); + + prog_fd = bpf_program__fd(skel->progs.xdp_find_sizes); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + free(buf); + + return err; +} + +/* xdp_pull_data_prog will directly read a marker 0xbb stored at buf[1024] + * so caller expecting XDP_PASS should always pass pull_len no less than 1024 + */ +static void run_test(struct test_xdp_pull_data *skel, int retval, + int frame_sz, int buff_len, int meta_len, int data_len, + int pull_len) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct xdp_md ctx = {}; + int prog_fd, err; + __u8 *buf; + + buf = calloc(buff_len, sizeof(__u8)); + if (!ASSERT_OK_PTR(buf, "calloc buf")) + return; + + 
buf[meta_len + 1023] = 0xaa; + buf[meta_len + 1024] = 0xbb; + buf[meta_len + 1025] = 0xcc; + + topts.data_in = buf; + topts.data_out = buf; + topts.data_size_in = buff_len; + topts.data_size_out = buff_len; + ctx.data = meta_len; + ctx.data_end = meta_len + data_len; + topts.ctx_in = &ctx; + topts.ctx_out = &ctx; + topts.ctx_size_in = sizeof(ctx); + topts.ctx_size_out = sizeof(ctx); + + skel->bss->data_len = data_len; + if (pull_len & PULL_MAX) { + int headroom = XDP_PACKET_HEADROOM - meta_len - skel->bss->xdpf_sz; + int tailroom = frame_sz - XDP_PACKET_HEADROOM - + data_len - skel->bss->sinfo_sz; + + pull_len = pull_len & PULL_PLUS_ONE ? 1 : 0; + pull_len += headroom + tailroom + data_len; + } + skel->bss->pull_len = pull_len; + + prog_fd = bpf_program__fd(skel->progs.xdp_pull_data_prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_EQ(topts.retval, retval, "xdp_pull_data_prog retval"); + + if (retval == XDP_DROP) + goto out; + + ASSERT_EQ(ctx.data_end, meta_len + pull_len, "linear data size"); + ASSERT_EQ(topts.data_size_out, buff_len, "linear + non-linear data size"); + /* Make sure data around xdp->data_end was not messed up by + * bpf_xdp_pull_data() + */ + ASSERT_EQ(buf[meta_len + 1023], 0xaa, "data[1023]"); + ASSERT_EQ(buf[meta_len + 1024], 0xbb, "data[1024]"); + ASSERT_EQ(buf[meta_len + 1025], 0xcc, "data[1025]"); +out: + free(buf); +} + +static void test_xdp_pull_data_basic(void) +{ + u32 pg_sz, max_meta_len, max_data_len; + struct test_xdp_pull_data *skel; + + skel = test_xdp_pull_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_xdp_pull_data__open_and_load")) + return; + + pg_sz = sysconf(_SC_PAGE_SIZE); + + if (find_xdp_sizes(skel, pg_sz)) + goto out; + + max_meta_len = XDP_PACKET_HEADROOM - skel->bss->xdpf_sz; + max_data_len = pg_sz - XDP_PACKET_HEADROOM - skel->bss->sinfo_sz; + + /* linear xdp pkt, pull 0 byte */ + run_test(skel, XDP_PASS, pg_sz, 2048, 0, 2048, 2048); + + /* multi-buf 
pkt, pull results in linear xdp pkt */ + run_test(skel, XDP_PASS, pg_sz, 2048, 0, 1024, 2048); + + /* multi-buf pkt, pull 1 byte to linear data area */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1024, 1025); + + /* multi-buf pkt, pull 0 byte to linear data area */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1025, 1025); + + /* multi-buf pkt, empty linear data area, pull requires memmove */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 0, PULL_MAX); + + /* multi-buf pkt, no headroom */ + run_test(skel, XDP_PASS, pg_sz, 9000, max_meta_len, 1024, PULL_MAX); + + /* multi-buf pkt, no tailroom, pull requires memmove */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, max_data_len, PULL_MAX); + + /* Test cases with invalid pull length */ + + /* linear xdp pkt, pull more than total data len */ + run_test(skel, XDP_DROP, pg_sz, 2048, 0, 2048, 2049); + + /* multi-buf pkt with no space left in linear data area */ + run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, max_data_len, + PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, empty linear data area */ + run_test(skel, XDP_DROP, pg_sz, 9000, 0, 0, PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, no headroom */ + run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, 1024, + PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, no tailroom */ + run_test(skel, XDP_DROP, pg_sz, 9000, 0, max_data_len, + PULL_MAX | PULL_PLUS_ONE); + +out: + test_xdp_pull_data__destroy(skel); +} + +void test_xdp_pull_data(void) +{ + if (test__start_subtest("xdp_pull_data")) + test_xdp_pull_data_basic(); +} diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c index a52feff98112..d1841aac94a2 100644 --- a/tools/testing/selftests/bpf/progs/arena_atomics.c +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -28,7 +28,8 @@ bool skip_all_tests = true; #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + 
(defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) bool skip_lacq_srel_tests __attribute((__section__(".data"))) = false; #else bool skip_lacq_srel_tests = true; @@ -314,7 +315,8 @@ int load_acquire(const void *ctx) { #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) #define LOAD_ACQUIRE_ARENA(SIZEOP, SIZE, SRC, DST) \ { asm volatile ( \ @@ -365,7 +367,8 @@ int store_release(const void *ctx) { #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) #define STORE_RELEASE_ARENA(SIZEOP, DST, VAL) \ { asm volatile ( \ diff --git a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c index c4500c37f85e..086b57a426cf 100644 --- a/tools/testing/selftests/bpf/progs/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c @@ -37,8 +37,11 @@ int prog(void *ctx) #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) unsigned long flags; - if ((ret = arena_spin_lock_irqsave(&lock, flags))) + if ((ret = arena_spin_lock_irqsave(&lock, flags))) { + if (ret == -EOPNOTSUPP) + test_skip = 3; return ret; + } if (counter != limit) counter++; bpf_repeat(cs_count); diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h index d67466c1ff77..f90531cf3ee5 100644 --- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h @@ -302,7 +302,7 @@ int 
arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val * barriers. */ if (val & _Q_LOCKED_MASK) - smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); /* * take ownership and clear the pending bit. @@ -380,7 +380,7 @@ queue: /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); - arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); /* * While waiting for the MCS lock, the next pointer may have diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 1654a530aa3d..4e51785e7606 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -101,7 +101,7 @@ static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, tp->snd_cwnd = pkts_in_flight + sndcnt; } -/* Decide wheather to run the increase function of congestion control. */ +/* Decide whether to run the increase function of congestion control. */ static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { if (tcp_sk(sk)->reordering > TCP_REORDERING) diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 7cd73e75f52a..32c511bcd60b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ -/* WARNING: This implemenation is not necessarily the same +/* WARNING: This implementation is not necessarily the same * as the tcp_dctcp.c. The purpose is mainly for testing * the kernel BPF logic. 
*/ diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index c1cfd297aabf..a7a1a684eed1 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -33,7 +33,20 @@ * e.g. "foo{{[0-9]+}}" matches strings like "foo007". * Extended POSIX regular expression syntax is allowed * inside the brackets. + * __not_msg Message not expected to be found in verifier log. + * If __not_msg is situated between __msg tags + * framework matches __msg tags first, and then + * checks that __not_msg is not present in a portion of + * a log between bracketing __msg tags. + * Same regex syntax as for __msg is supported. * __msg_unpriv Same as __msg but for unprivileged mode. + * __not_msg_unpriv Same as __not_msg but for unprivileged mode. + * + * __stderr Message expected to be found in bpf stderr stream. The + * same regex rules apply like __msg. + * __stderr_unpriv Same as __stderr but for unprivileged mode. + * __stdout Same as __stderr but for stdout stream. + * __stdout_unpriv Same as __stdout but for unprivileged mode. * * __xlated Expect a line in a disassembly log after verifier applies rewrites. * Multiple __xlated attributes could be specified. @@ -115,12 +128,14 @@ * __caps_unpriv Specify the capabilities that should be set when running the test. 
*/ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) +#define __not_msg(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg=" XSTR(__COUNTER__) "=" msg))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __jited(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __not_msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __jited_unpriv(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) @@ -136,9 +151,14 @@ #define __arch_x86_64 __arch("X86_64") #define __arch_arm64 __arch("ARM64") #define __arch_riscv64 __arch("RISCV64") +#define __arch_s390x __arch("s390x") #define __caps_unpriv(caps) __attribute__((btf_decl_tag("comment:test_caps_unpriv=" EXPAND_QUOTE(caps)))) #define __load_if_JITed() __attribute__((btf_decl_tag("comment:load_mode=jited"))) #define __load_if_no_JITed() __attribute__((btf_decl_tag("comment:load_mode=no_jited"))) +#define __stderr(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr=" XSTR(__COUNTER__) "=" msg))) +#define __stderr_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __stdout(msg) 
__attribute__((btf_decl_tag("comment:test_expect_stdout=" XSTR(__COUNTER__) "=" msg))) +#define __stdout_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout_unpriv=" XSTR(__COUNTER__) "=" msg))) /* Define common capabilities tested using __caps_unpriv */ #define CAP_NET_ADMIN 12 @@ -156,6 +176,10 @@ #define __imm_ptr(name) [name]"r"(&name) #define __imm_insn(name, expr) [name]"i"(*(long *)&(expr)) +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) + /* Magic constants used with __retval() */ #define POINTER_VALUE 0xbadcafe #define TEST_DATA_LEN 64 diff --git a/tools/testing/selftests/bpf/progs/bpf_test_utils.h b/tools/testing/selftests/bpf/progs/bpf_test_utils.h new file mode 100644 index 000000000000..f4e67b492dd2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_test_utils.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BPF_TEST_UTILS_H__ +#define __BPF_TEST_UTILS_H__ + +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +/* Clobber as many native registers and stack slots as possible. 
*/ +static __always_inline void clobber_regs_stack(void) +{ + char tmp_str[] = "123456789"; + unsigned long tmp; + + bpf_strtoul(tmp_str, sizeof(tmp_str), 0, &tmp); + __sink(tmp); +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c b/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c index 092db1d0435e..88e13e17ec9e 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c +++ b/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c @@ -73,7 +73,7 @@ int BPF_PROG(use_css_iter_non_sleepable) } SEC("lsm.s/socket_connect") -__failure __msg("expected an RCU CS") +__failure __msg("kernel func bpf_iter_css_new requires RCU critical section protection") int BPF_PROG(use_css_iter_sleepable_missing_rcu_lock) { u64 cgrp_id = bpf_get_current_cgroup_id(); diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c index 5354455a01be..02d8f160ca0e 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c @@ -221,3 +221,15 @@ int BPF_PROG(test_cgrp_from_id, struct cgroup *cgrp, const char *path) return 0; } + +SEC("syscall") +int test_cgrp_from_id_ns(void *ctx) +{ + struct cgroup *cg; + + cg = bpf_cgroup_from_id(1); + if (!cg) + return 42; + bpf_cgroup_release(cg); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c index 645be6cddf36..dfd8a258f14a 100644 --- a/tools/testing/selftests/bpf/progs/crypto_sanity.c +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -14,7 +14,7 @@ unsigned char key[256] = {}; u16 udp_test_port = 7777; u32 authsize, key_len; char algo[128] = {}; -char dst[16] = {}; +char dst[16] = {}, dst_bad[8] = {}; int status; static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) @@ -59,10 +59,9 @@ int skb_crypto_setup(void *ctx) .authsize = authsize, }; struct bpf_crypto_ctx *cctx; - 
int err = 0; + int err; status = 0; - if (key_len > 256) { status = -EINVAL; return 0; @@ -70,8 +69,8 @@ int skb_crypto_setup(void *ctx) __builtin_memcpy(&params.algo, algo, sizeof(algo)); __builtin_memcpy(&params.key, key, sizeof(key)); - cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err); + cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err); if (!cctx) { status = err; return 0; @@ -80,7 +79,6 @@ int skb_crypto_setup(void *ctx) err = crypto_ctx_insert(cctx); if (err && err != -EEXIST) status = err; - return 0; } @@ -92,6 +90,7 @@ int decrypt_sanity(struct __sk_buff *skb) struct bpf_dynptr psrc, pdst; int err; + status = 0; err = skb_dynptr_validate(skb, &psrc); if (err < 0) { status = err; @@ -110,13 +109,23 @@ int decrypt_sanity(struct __sk_buff *skb) return TC_ACT_SHOT; } - /* dst is a global variable to make testing part easier to check. In real - * production code, a percpu map should be used to store the result. + /* Check also bad case where the dst buffer is smaller than the + * skb's linear section. + */ + bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst); + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL); + if (!status) + status = -EIO; + if (status != -EINVAL) + goto err; + + /* dst is a global variable to make testing part easier to check. + * In real production code, a percpu map should be used to store + * the result. */ bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); - status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL); - +err: return TC_ACT_SHOT; } @@ -129,7 +138,6 @@ int encrypt_sanity(struct __sk_buff *skb) int err; status = 0; - err = skb_dynptr_validate(skb, &psrc); if (err < 0) { status = err; @@ -148,13 +156,23 @@ int encrypt_sanity(struct __sk_buff *skb) return TC_ACT_SHOT; } - /* dst is a global variable to make testing part easier to check. In real - * production code, a percpu map should be used to store the result. + /* Check also bad case where the dst buffer is smaller than the + * skb's linear section. 
+ */ + bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst); + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL); + if (!status) + status = -EIO; + if (status != -EINVAL) + goto err; + + /* dst is a global variable to make testing part easier to check. + * In real production code, a percpu map should be used to store + * the result. */ bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); - status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL); - +err: return TC_ACT_SHOT; } diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index bd8f15229f5c..dda6a8dada82 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -269,6 +269,26 @@ int data_slice_out_of_bounds_skb(struct __sk_buff *skb) return SK_PASS; } +/* A metadata slice can't be accessed out of bounds */ +SEC("?tc") +__failure __msg("value is outside of the allowed memory range") +int data_slice_out_of_bounds_skb_meta(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + /* this should fail */ + *(md + 1) = 42; + + return SK_PASS; +} + SEC("?raw_tp") __failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_map_value(void *ctx) @@ -1089,6 +1109,26 @@ int skb_invalid_slice_write(struct __sk_buff *skb) return SK_PASS; } +/* bpf_dynptr_slice()s are read-only and cannot be written to */ +SEC("?tc") +__failure __msg("R{{[0-9]+}} cannot write into rdonly_mem") +int skb_meta_invalid_slice_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + /* The read-only data slice is invalidated whenever a helper changes 
packet data */ SEC("?tc") __failure __msg("invalid mem access 'scalar'") @@ -1192,6 +1232,188 @@ int skb_invalid_data_slice4(struct __sk_buff *skb) return SK_PASS; } +/* Read-only skb data slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *d; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + d = bpf_dynptr_slice(&data, 0, NULL, sizeof(*d)); + if (!d) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + val = *d; + + return SK_PASS; +} + +/* Read-write skb data slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *d; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + d = bpf_dynptr_slice_rdwr(&data, 0, NULL, sizeof(*d)); + if (!d) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + *d = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated on write to skb data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_payload_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *md; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&data, 0, "x", 1, 0); + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated on write to skb data slice */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_meta_slice_invalid_after_payload_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *md; 
+ + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&data, 0, "x", 1, 0); + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated whenever a helper changes packet data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_payload_helper(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated whenever a helper changes packet data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_meta_slice_invalid_after_payload_helper(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int 
rw_skb_meta_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + /* The read-only data slice is invalidated whenever a helper changes packet data */ SEC("?xdp") __failure __msg("invalid mem access 'scalar'") @@ -1255,6 +1477,19 @@ int skb_invalid_ctx(void *ctx) return 0; } +/* Only supported prog type can create skb_meta-type dynptrs */ +SEC("?raw_tp") +__failure __msg("calling kernel function bpf_dynptr_from_skb_meta is not allowed") +int skb_meta_invalid_ctx(void *ctx) +{ + struct bpf_dynptr meta; + + /* this should fail */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + + return 0; +} + SEC("fentry/skb_tx_error") __failure __msg("must be referenced or trusted") int BPF_PROG(skb_invalid_ctx_fentry, void *skb) @@ -1665,6 +1900,29 @@ int clone_skb_packet_data(struct __sk_buff *skb) return 0; } +/* A skb clone's metadata slice becomes invalid anytime packet data changes */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int clone_skb_packet_meta(struct __sk_buff *skb) +{ + struct bpf_dynptr clone, meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + bpf_dynptr_clone(&meta, &clone); + md = bpf_dynptr_slice_rdwr(&clone, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail */ + *md = 42; + + return 0; +} + /* A xdp clone's data slices should be invalid anytime packet data changes */ SEC("?xdp") __failure __msg("invalid mem access 'scalar'") diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 8315273cb900..127dea342e5a 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ 
b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -211,6 +211,61 @@ int test_dynptr_skb_data(struct __sk_buff *skb) return 1; } +SEC("?tc") +int test_dynptr_skb_meta_data(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + int ret; + + err = 1; + ret = bpf_dynptr_from_skb_meta(skb, 0, &meta); + if (ret) + return 1; + + /* This should return NULL. Must use bpf_dynptr_slice API */ + err = 2; + md = bpf_dynptr_data(&meta, 0, sizeof(*md)); + if (md) + return 1; + + err = 0; + return 1; +} + +/* Check that skb metadata dynptr ops don't accept any flags. */ +SEC("?tc") +int test_dynptr_skb_meta_flags(struct __sk_buff *skb) +{ + const __u64 INVALID_FLAGS = ~0ULL; + struct bpf_dynptr meta; + __u8 buf; + int ret; + + err = 1; + ret = bpf_dynptr_from_skb_meta(skb, INVALID_FLAGS, &meta); + if (ret != -EINVAL) + return 1; + + err = 2; + ret = bpf_dynptr_from_skb_meta(skb, 0, &meta); + if (ret) + return 1; + + err = 3; + ret = bpf_dynptr_read(&buf, 0, &meta, 0, INVALID_FLAGS); + if (ret != -EINVAL) + return 1; + + err = 4; + ret = bpf_dynptr_write(&meta, 0, &buf, 0, INVALID_FLAGS); + if (ret != -EINVAL) + return 1; + + err = 0; + return 1; +} + SEC("tp/syscalls/sys_enter_nanosleep") int test_adjust(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 5e0a1ca96d4e..a01c2736890f 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -18,43 +18,43 @@ return *(u64 *)num; \ } -__msg(": R0_w=0xffffffff80000000") +__msg(": R0=0xffffffff80000000") check_assert(s64, ==, eq_int_min, INT_MIN); -__msg(": R0_w=0x7fffffff") +__msg(": R0=0x7fffffff") check_assert(s64, ==, eq_int_max, INT_MAX); -__msg(": R0_w=0") +__msg(": R0=0") check_assert(s64, ==, eq_zero, 0); -__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000") +__msg(": R0=0x8000000000000000 R1=0x8000000000000000") check_assert(s64, ==, eq_llong_min, 
LLONG_MIN); -__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff") +__msg(": R0=0x7fffffffffffffff R1=0x7fffffffffffffff") check_assert(s64, ==, eq_llong_max, LLONG_MAX); -__msg(": R0_w=scalar(id=1,smax=0x7ffffffe)") +__msg(": R0=scalar(id=1,smax=0x7ffffffe)") check_assert(s64, <, lt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") check_assert(s64, <, lt_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff7fffffff") +__msg(": R0=scalar(id=1,smax=0xffffffff7fffffff") check_assert(s64, <, lt_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smax=0x7fffffff)") +__msg(": R0=scalar(id=1,smax=0x7fffffff)") check_assert(s64, <=, le_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=0)") +__msg(": R0=scalar(id=1,smax=0)") check_assert(s64, <=, le_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff80000000") +__msg(": R0=scalar(id=1,smax=0xffffffff80000000") check_assert(s64, <=, le_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000001") +__msg(": R0=scalar(id=1,smin=0xffffffff80000001") check_assert(s64, >, gt_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_pos, INT_MAX); -__msg(": 
R0_w=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000000") +__msg(": R0=scalar(id=1,smin=0xffffffff80000000") check_assert(s64, >=, ge_neg, INT_MIN); SEC("?tc") diff --git a/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c b/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c index 544e5ac90461..d09bbd8ae8a8 100644 --- a/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c +++ b/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c @@ -12,7 +12,7 @@ SEC("freplace/connect_v4_prog") int new_connect_v4_prog(struct bpf_sock_addr *ctx) { - // return value thats in invalid range + // return value that's in invalid range return 255; } diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index f41257eadbb2..d273b46dfc7c 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int create_and_destroy(void *ctx) { struct bpf_iter_num iter; @@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int valid_stack_reuse(void *ctx) { struct bpf_iter_num iter; @@ -345,7 +345,7 @@ int __naked read_from_iter_slot_fail(void) "r3 = 1000;" "call %[bpf_iter_num_new];" - /* attemp to leak bpf_iter_num state */ + /* attempt to leak bpf_iter_num state */ "r7 = *(u64 *)(r6 + 0);" "r8 = *(u64 *)(r6 + 8);" diff --git 
a/tools/testing/selftests/bpf/progs/iters_task_failure.c b/tools/testing/selftests/bpf/progs/iters_task_failure.c index 6b1588d70652..fe3663dedbe1 100644 --- a/tools/testing/selftests/bpf/progs/iters_task_failure.c +++ b/tools/testing/selftests/bpf/progs/iters_task_failure.c @@ -15,7 +15,7 @@ void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") -__failure __msg("expected an RCU CS when using bpf_iter_task_next") +__failure __msg("kernel func bpf_iter_task_new requires RCU critical section protection") int BPF_PROG(iter_tasks_without_lock) { struct task_struct *pos; @@ -27,7 +27,7 @@ int BPF_PROG(iter_tasks_without_lock) } SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") -__failure __msg("expected an RCU CS when using bpf_iter_css_next") +__failure __msg("kernel func bpf_iter_css_new requires RCU critical section protection") int BPF_PROG(iter_css_without_lock) { u64 cg_id = bpf_get_current_cgroup_id(); diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c index 9e4b45201e69..5379e9960ffd 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -123,3 +123,49 @@ out: bpf_iter_num_destroy(&num_it); return 0; } + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("kernel func bpf_kfunc_ret_rcu_test requires RCU critical section protection") +int iter_ret_rcu_test_protected(const void *ctx) +{ + struct task_struct *p; + + p = bpf_kfunc_ret_rcu_test(); + return p->pid; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("R1 type=rcu_ptr_or_null_ expected=") +int iter_ret_rcu_test_type(const void *ctx) +{ + struct task_struct *p; + + bpf_rcu_read_lock(); + p = bpf_kfunc_ret_rcu_test(); + bpf_this_cpu_ptr(p); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("kernel func bpf_kfunc_ret_rcu_test_nostruct requires 
RCU critical section protection") +int iter_ret_rcu_test_protected_nostruct(const void *ctx) +{ + void *p; + + p = bpf_kfunc_ret_rcu_test_nostruct(4); + return *(int *)p; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("R1 type=rdonly_rcu_mem_or_null expected=") +int iter_ret_rcu_test_type_nostruct(const void *ctx) +{ + void *p; + + bpf_rcu_read_lock(); + p = bpf_kfunc_ret_rcu_test_nostruct(4); + bpf_this_cpu_ptr(p); + bpf_rcu_read_unlock(); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 6543d5b6e0a9..83791348bed5 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -20,7 +20,7 @@ __s64 res_empty; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_empty(const void *ctx) @@ -38,7 +38,7 @@ __s64 res_full; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_full(const void *ctx) @@ -58,7 +58,7 @@ static volatile int zero = 0; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_truncated(const void *ctx) diff --git a/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c b/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c new file mode 100644 
index 000000000000..f77aef0474d3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +#if defined(__TARGET_ARCH_x86) +SEC("kprobe") +int kprobe_write_ctx(struct pt_regs *ctx) +{ + ctx->ax = 0; + return 0; +} + +SEC("kprobe.multi") +int kprobe_multi_write_ctx(struct pt_regs *ctx) +{ + ctx->ax = 0; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/linked_list_fail.c b/tools/testing/selftests/bpf/progs/linked_list_fail.c index 6438982b928b..ddd26d1a083f 100644 --- a/tools/testing/selftests/bpf/progs/linked_list_fail.c +++ b/tools/testing/selftests/bpf/progs/linked_list_fail.c @@ -226,8 +226,7 @@ int obj_new_no_composite(void *ctx) SEC("?tc") int obj_new_no_struct(void *ctx) { - - bpf_obj_new(union { int data; unsigned udata; }); + (void)bpf_obj_new(union { int data; unsigned udata; }); return 0; } @@ -252,7 +251,7 @@ int new_null_ret(void *ctx) SEC("?tc") int obj_new_acq(void *ctx) { - bpf_obj_new(struct foo); + (void)bpf_obj_new(struct foo); return 0; } diff --git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c index 50e66772c046..b0fa26fb4760 100644 --- a/tools/testing/selftests/bpf/progs/loop1.c +++ b/tools/testing/selftests/bpf/progs/loop1.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include <linux/sched.h> -#include <linux/ptrace.h> -#include <stdint.h> -#include <stddef.h> -#include <stdbool.h> -#include <linux/bpf.h> +#include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c index 947bb7e988c2..0227409d4b0e 100644 --- a/tools/testing/selftests/bpf/progs/loop2.c +++ b/tools/testing/selftests/bpf/progs/loop2.c @@ -1,11 +1,6 @@ // 
SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include <linux/sched.h> -#include <linux/ptrace.h> -#include <stdint.h> -#include <stddef.h> -#include <stdbool.h> -#include <linux/bpf.h> +#include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c index 717dab14322b..5d1c9a775e6b 100644 --- a/tools/testing/selftests/bpf/progs/loop3.c +++ b/tools/testing/selftests/bpf/progs/loop3.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include <linux/sched.h> -#include <linux/ptrace.h> -#include <stdint.h> -#include <stddef.h> -#include <stdbool.h> -#include <linux/bpf.h> +#include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c index e4ff97fbcce1..dd36aff4fba3 100644 --- a/tools/testing/selftests/bpf/progs/loop6.c +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/ptrace.h> -#include <stddef.h> -#include <linux/bpf.h> +#include <vmlinux.h> +#include <bpf/bpf_core_read.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include "bpf_misc.h" @@ -26,12 +25,6 @@ char _license[] SEC("license") = "GPL"; #define SG_CHAIN 0x01UL #define SG_END 0x02UL -struct scatterlist { - unsigned long page_link; - unsigned int offset; - unsigned int length; -}; - #define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) #define sg_is_last(sg) ((sg)->page_link & SG_END) #define sg_chain_ptr(sg) \ @@ -62,7 +55,7 @@ static inline struct scatterlist *get_sgp(struct scatterlist **sgs, int i) return sgp; } -int config = 0; +int run_once = 0; int result = 0; SEC("kprobe/virtqueue_add_sgs") @@ -73,14 +66,14 @@ int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, __u64 length1 = 0, length2 = 0; unsigned int i, n, 
len; - if (config != 0) + if (run_once != 0) return 0; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < out_sgs); i++) { __sink(out_sgs); for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); sgp = __sg_next(sgp)) { - bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + len = BPF_CORE_READ(sgp, length); length1 += len; n++; } @@ -90,13 +83,13 @@ int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, __sink(in_sgs); for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); sgp = __sg_next(sgp)) { - bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + len = BPF_CORE_READ(sgp, length); length2 += len; n++; } } - config = 1; + run_once = 1; result = length2 - length1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/lpm_trie.h b/tools/testing/selftests/bpf/progs/lpm_trie.h new file mode 100644 index 000000000000..76aa5821807f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PROGS_LPM_TRIE_H +#define __PROGS_LPM_TRIE_H + +struct trie_key { + __u32 prefixlen; + __u32 data; +}; + +/* Benchmark operations */ +enum { + LPM_OP_NOOP = 0, + LPM_OP_BASELINE, + LPM_OP_LOOKUP, + LPM_OP_INSERT, + LPM_OP_UPDATE, + LPM_OP_DELETE, + LPM_OP_FREE +}; + +/* + * Return values from run_bench. + * + * Negative values are also allowed and represent kernel error codes. 
+ */ +#define LPM_BENCH_SUCCESS 0 +#define LPM_BENCH_REINIT_MAP 1 /* Reset trie to initial state for current op */ + +#endif diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c new file mode 100644 index 000000000000..a0e6ebd5507a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Cloudflare */ + +#include <vmlinux.h> +#include <errno.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> +#include "bpf_misc.h" +#include "bpf_atomic.h" +#include "progs/lpm_trie.h" + +#define BPF_OBJ_NAME_LEN 16U +#define MAX_ENTRIES 100000000 +#define NR_LOOPS 10000 + +char _license[] SEC("license") = "GPL"; + +/* Filled by userspace. See fill_map() in bench_lpm_trie_map.c */ +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __type(key, struct trie_key); + __type(value, __u32); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, MAX_ENTRIES); +} trie_map SEC(".maps"); + +long hits; +long duration_ns; + +/* Configured from userspace */ +__u32 nr_entries; +__u32 prefixlen; +bool random; +__u8 op; + +static __u64 latency_free_start; + +SEC("fentry/bpf_map_free_deferred") +int BPF_PROG(trie_free_entry, struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + char name[BPF_OBJ_NAME_LEN]; + u32 map_type; + + map_type = BPF_CORE_READ(map, map_type); + if (map_type != BPF_MAP_TYPE_LPM_TRIE) + return 0; + + /* + * Ideally we'd have access to the map ID but that's already + * freed before we enter trie_free(). 
+ */ + BPF_CORE_READ_STR_INTO(&name, map, name); + if (bpf_strncmp(name, BPF_OBJ_NAME_LEN, "trie_free_map")) + return 0; + + latency_free_start = bpf_ktime_get_ns(); + + return 0; +} + +SEC("fexit/bpf_map_free_deferred") +int BPF_PROG(trie_free_exit, struct work_struct *work) +{ + __u64 val; + + if (!latency_free_start) + return 0; + + val = bpf_ktime_get_ns() - latency_free_start; + latency_free_start = 0; + + __sync_add_and_fetch(&duration_ns, val); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +static __u32 cur_key; + +static __always_inline void generate_key(struct trie_key *key) +{ + key->prefixlen = prefixlen; + + if (random) + key->data = bpf_get_prandom_u32() % nr_entries; + else + key->data = cur_key++ % nr_entries; +} + +static int noop(__u32 index, __u32 *unused) +{ + return 0; +} + +static int baseline(__u32 index, __u32 *unused) +{ + struct trie_key key; + __u32 blackbox = 0; + + generate_key(&key); + /* Avoid compiler optimizing out the modulo */ + barrier_var(blackbox); + blackbox = READ_ONCE(key.data); + + return 0; +} + +static int lookup(__u32 index, int *retval) +{ + struct trie_key key; + + generate_key(&key); + if (!bpf_map_lookup_elem(&trie_map, &key)) { + *retval = -ENOENT; + return 1; + } + + return 0; +} + +static int insert(__u32 index, int *retval) +{ + struct trie_key key; + u32 val = 1; + int err; + + generate_key(&key); + err = bpf_map_update_elem(&trie_map, &key, &val, BPF_NOEXIST); + if (err) { + *retval = err; + return 1; + } + + /* Is this the last entry? 
*/ + if (key.data == nr_entries - 1) { + /* For atomicity concerns, see the comment in delete() */ + *retval = LPM_BENCH_REINIT_MAP; + return 1; + } + + return 0; +} + +static int update(__u32 index, int *retval) +{ + struct trie_key key; + u32 val = 1; + int err; + + generate_key(&key); + err = bpf_map_update_elem(&trie_map, &key, &val, BPF_EXIST); + if (err) { + *retval = err; + return 1; + } + + return 0; +} + +static int delete(__u32 index, int *retval) +{ + struct trie_key key; + int err; + + generate_key(&key); + err = bpf_map_delete_elem(&trie_map, &key); + if (err) { + *retval = err; + return 1; + } + + /* Do we need to refill the map? */ + if (key.data == nr_entries - 1) { + /* + * Atomicity isn't required because DELETE only supports + * one producer running concurrently. What we need is a + * way to track how many entries have been deleted from + * the trie between consecutive invocations of the BPF + * prog because a single bpf_loop() call might not + * delete all entries, e.g. when NR_LOOPS < nr_entries. 
+ */ + *retval = LPM_BENCH_REINIT_MAP; + return 1; + } + + return 0; +} + +SEC("xdp") +int BPF_PROG(run_bench) +{ + int err = LPM_BENCH_SUCCESS; + u64 start, delta; + int loops; + + start = bpf_ktime_get_ns(); + + switch (op) { + case LPM_OP_NOOP: + loops = bpf_loop(NR_LOOPS, noop, NULL, 0); + break; + case LPM_OP_BASELINE: + loops = bpf_loop(NR_LOOPS, baseline, NULL, 0); + break; + case LPM_OP_LOOKUP: + loops = bpf_loop(NR_LOOPS, lookup, &err, 0); + break; + case LPM_OP_INSERT: + loops = bpf_loop(NR_LOOPS, insert, &err, 0); + break; + case LPM_OP_UPDATE: + loops = bpf_loop(NR_LOOPS, update, &err, 0); + break; + case LPM_OP_DELETE: + loops = bpf_loop(NR_LOOPS, delete, &err, 0); + break; + default: + bpf_printk("invalid benchmark operation\n"); + return -1; + } + + delta = bpf_ktime_get_ns() - start; + + __sync_add_and_fetch(&duration_ns, delta); + __sync_add_and_fetch(&hits, loops); + + return err; +} diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_map.c b/tools/testing/selftests/bpf/progs/lpm_trie_map.c new file mode 100644 index 000000000000..6e60d686b664 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie_map.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define MAX_ENTRIES 100000000 + +struct trie_key { + __u32 prefixlen; + __u32 data; +}; + +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __type(key, struct trie_key); + __type(value, __u32); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, MAX_ENTRIES); +} trie_free_map SEC(".maps"); diff --git a/tools/testing/selftests/bpf/progs/map_excl.c b/tools/testing/selftests/bpf/progs/map_excl.c new file mode 100644 index 000000000000..d461684728e4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/map_excl.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Google LLC. 
*/ +#include <linux/bpf.h> +#include <time.h> +#include <bpf/bpf_helpers.h> + +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 1); +} excl_map SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int should_have_access(void *ctx) +{ + int key = 0, value = 0xdeadbeef; + + bpf_map_update_elem(&excl_map, &key, &value, 0); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int should_not_have_access(void *ctx) +{ + int key = 0, value = 0xdeadbeef; + + bpf_map_update_elem(&excl_map, &key, &value, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c index 4f94c971ae86..3b984b6ae7c0 100644 --- a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c +++ b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c @@ -8,8 +8,8 @@ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r8 = *(u64 *)(r7 +0) ; R7_w=ptr_nameidata(off={{[0-9]+}}) R8_w=rdonly_untrusted_mem(sz=0)") -__msg("r9 = *(u8 *)(r8 +0) ; R8_w=rdonly_untrusted_mem(sz=0) R9_w=scalar") +__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(off={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") +__msg("r9 = *(u8 *)(r8 +0) ; R8=rdonly_untrusted_mem(sz=0) R9=scalar") int btf_id_to_ptr_mem(void *ctx) { struct task_struct *task; diff --git a/tools/testing/selftests/bpf/progs/rbtree_search.c b/tools/testing/selftests/bpf/progs/rbtree_search.c index 098ef970fac1..b05565d1db0d 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_search.c +++ b/tools/testing/selftests/bpf/progs/rbtree_search.c @@ -183,7 +183,7 @@ long test_##op##_spinlock_##dolock(void *ctx) \ } /* - * Use a spearate MSG macro instead of passing to TEST_XXX(..., MSG) + * Use a separate MSG macro instead of passing to TEST_XXX(..., MSG) * to ensure the message itself is not in the bpf prog lineinfo * which the verifier 
includes in its log. * Otherwise, the test_loader will incorrectly match the prog lineinfo diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/stacktrace_map.c index 47568007b668..0c77df05be7f 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_map.c @@ -50,6 +50,7 @@ struct sched_switch_args { int next_prio; }; +__u32 stack_id; SEC("tracepoint/sched/sched_switch") int oncpu(struct sched_switch_args *ctx) { @@ -64,6 +65,7 @@ int oncpu(struct sched_switch_args *ctx) /* The size of stackmap and stackid_hmap should be the same */ key = bpf_get_stackid(ctx, &stackmap, 0); if ((int)key >= 0) { + stack_id = key; bpf_map_update_elem(&stackid_hmap, &key, &val, 0); stack_p = bpf_map_lookup_elem(&stack_amap, &key); if (stack_p) diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index 35790897dc87..4a5bd852f10c 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -5,6 +5,7 @@ #include <bpf/bpf_helpers.h> #include "bpf_misc.h" #include "bpf_experimental.h" +#include "bpf_arena_common.h" struct arr_elem { struct bpf_res_spin_lock lock; @@ -17,10 +18,29 @@ struct { __type(value, struct arr_elem); } arrmap SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); /* number of pages */ +} arena SEC(".maps"); + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + #define ENOSPC 28 #define _STR "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" int size; +u64 fault_addr; +void *arena_ptr; SEC("syscall") __success __retval(0) @@ -37,7 +57,15 @@ int stream_exhaust(void *ctx) } SEC("syscall") +__arch_x86_64 +__arch_arm64 +__arch_s390x __success __retval(0) 
+__stderr("ERROR: Timeout detected for may_goto instruction") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") int stream_cond_break(void *ctx) { while (can_loop) @@ -47,6 +75,15 @@ int stream_cond_break(void *ctx) SEC("syscall") __success __retval(0) +__stderr("ERROR: AA or ABBA deadlock detected for bpf_res_spin_lock") +__stderr("{{Attempted lock = (0x[0-9a-fA-F]+)\n" +"Total held locks = 1\n" +"Held lock\\[ 0\\] = \\1}}") +__stderr("...") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") int stream_deadlock(void *ctx) { struct bpf_res_spin_lock *lock, *nlock; @@ -76,4 +113,125 @@ int stream_syscall(void *ctx) return 0; } +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_write_fault(void *ctx) +{ + struct bpf_arena *ptr = (void *)&arena; + u64 user_vm_start; + + /* Prevent GCC bounds warning: casting &arena to struct bpf_arena * + * triggers bounds checking since the map definition is smaller than struct + * bpf_arena. 
barrier_var() makes the pointer opaque to GCC, preventing the + * bounds analysis + */ + barrier_var(ptr); + user_vm_start = ptr->user_vm_start; + fault_addr = user_vm_start + 0x7fff; + bpf_addr_space_cast(user_vm_start, 0, 1); + asm volatile ( + "r1 = %0;" + "r2 = 1;" + "*(u32 *)(r1 + 0x7fff) = r2;" + : + : "r" (user_vm_start) + : "r1", "r2" + ); + return 0; +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena READ access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_read_fault(void *ctx) +{ + struct bpf_arena *ptr = (void *)&arena; + u64 user_vm_start; + + /* Prevent GCC bounds warning: casting &arena to struct bpf_arena * + * triggers bounds checking since the map definition is smaller than struct + * bpf_arena. barrier_var() makes the pointer opaque to GCC, preventing the + * bounds analysis + */ + barrier_var(ptr); + user_vm_start = ptr->user_vm_start; + fault_addr = user_vm_start + 0x7fff; + bpf_addr_space_cast(user_vm_start, 0, 1); + asm volatile ( + "r1 = %0;" + "r1 = *(u32 *)(r1 + 0x7fff);" + : + : "r" (user_vm_start) + : "r1" + ); + return 0; +} + +static __noinline void subprog(void) +{ + int __arena *addr = (int __arena *)0xdeadbeef; + + arena_ptr = &arena; + *addr = 1; +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_subprog_fault(void *ctx) +{ + subprog(); + return 0; +} + +static __noinline int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + int __arena *addr = (int __arena *)0xdeadbeef; + + arena_ptr = &arena; + *addr = 1; + return 0; 
+} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_callback_fault(void *ctx) +{ + struct bpf_timer *arr_timer; + + arr_timer = bpf_map_lookup_elem(&array, &(int){0}); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, 1); + bpf_timer_set_callback(arr_timer, timer_cb); + bpf_timer_start(arr_timer, 0, 0); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 53af438bd998..99d72c68f76a 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -31,6 +31,8 @@ char *invalid_kern_ptr = (char *)-1; /* Passing NULL to string kfuncs (treated as a userspace ptr) */ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_null1(void *ctx) { return bpf_strcmp(NULL, "hello"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strcmp_null2(void *ctx) { return bpf_strcmp("hello", NULL); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_null1(void *ctx) { return bpf_strcasecmp(NULL, "HELLO"); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strcasecmp_null2(void *ctx) { return bpf_strcasecmp("HELLO", NULL); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchr_null(void *ctx) { return bpf_strchr(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchrnul_null(void *ctx) { return bpf_strchrnul(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnchr_null(void *ctx) { return bpf_strnchr(NULL, 1, 'a'); } @@ -49,6 +51,8 @@ SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null2(void *ctx) { return /* Passing userspace ptr to string kfuncs 
*/ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { return bpf_strcmp(user_ptr, "hello"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr2(void *ctx) { return bpf_strcmp("hello", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr1(void *ctx) { return bpf_strcasecmp(user_ptr, "HELLO"); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr2(void *ctx) { return bpf_strcasecmp("HELLO", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchr_user_ptr(void *ctx) { return bpf_strchr(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchrnul_user_ptr(void *ctx) { return bpf_strchrnul(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnchr_user_ptr(void *ctx) { return bpf_strnchr(user_ptr, 1, 'a'); } @@ -69,6 +73,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr2(void *ctx) { re /* Passing invalid kernel ptr to string kfuncs should always return -EFAULT */ SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault1(void *ctx) { return bpf_strcmp(invalid_kern_ptr, "hello"); } SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault2(void *ctx) { return bpf_strcmp("hello", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault1(void *ctx) { return bpf_strcasecmp(invalid_kern_ptr, "HELLO"); } +SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault2(void *ctx) { return bpf_strcasecmp("HELLO", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strchr_pagefault(void *ctx) { return bpf_strchr(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strchrnul_pagefault(void *ctx) { return bpf_strchrnul(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strnchr_pagefault(void *ctx) { return bpf_strnchr(invalid_kern_ptr, 1, 'a'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c 
b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index 89fb4669b0e9..e41cc5601994 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -7,6 +7,7 @@ char long_str[XATTR_SIZE_MAX + 1]; SEC("syscall") int test_strcmp_too_long(void *ctx) { return bpf_strcmp(long_str, long_str); } +SEC("syscall") int test_strcasecmp_too_long(void *ctx) { return bpf_strcasecmp(long_str, long_str); } SEC("syscall") int test_strchr_too_long(void *ctx) { return bpf_strchr(long_str, 'b'); } SEC("syscall") int test_strchrnul_too_long(void *ctx) { return bpf_strchrnul(long_str, 'b'); } SEC("syscall") int test_strnchr_too_long(void *ctx) { return bpf_strnchr(long_str, sizeof(long_str), 'b'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index 46697f381878..2e3498e37b9c 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -12,6 +12,11 @@ char str[] = "hello world"; /* Functional tests */ __test(0) int test_strcmp_eq(void *ctx) { return bpf_strcmp(str, "hello world"); } __test(1) int test_strcmp_neq(void *ctx) { return bpf_strcmp(str, "hello"); } +__test(0) int test_strcasecmp_eq1(void *ctx) { return bpf_strcasecmp(str, "hello world"); } +__test(0) int test_strcasecmp_eq2(void *ctx) { return bpf_strcasecmp(str, "HELLO WORLD"); } +__test(0) int test_strcasecmp_eq3(void *ctx) { return bpf_strcasecmp(str, "HELLO world"); } +__test(1) int test_strcasecmp_neq1(void *ctx) { return bpf_strcasecmp(str, "hello"); } +__test(1) int test_strcasecmp_neq2(void *ctx) { return bpf_strcasecmp(str, "HELLO"); } __test(1) int test_strchr_found(void *ctx) { return bpf_strchr(str, 'e'); } __test(11) int test_strchr_null(void *ctx) { return bpf_strchr(str, '\0'); } __test(-ENOENT) int test_strchr_notfound(void *ctx) { return bpf_strchr(str, 'x'); } @@ 
-30,8 +35,12 @@ __test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); } __test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); } __test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); } __test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); } -__test(0) int test_strnstr_found(void *ctx) { return bpf_strnstr(str, "hello", 6); } -__test(-ENOENT) int test_strnstr_notfound(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); } +__test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); } +__test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); } +__test(-ENOENT) int test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); } +__test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); } __test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c new file mode 100644 index 000000000000..ad8bb546c9bf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define bpf_kfunc_multi_st_ops_test_1(args) bpf_kfunc_multi_st_ops_test_1(args, st_ops_id) +int st_ops_id; + +int test_pid; +int test_err; + +#define MAP1_MAGIC 1234 + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + return MAP1_MAGIC; +} + +SEC("tp_btf/sys_enter") +int 
BPF_PROG(sys_enter, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP1_MAGIC) + test_err++; + + return 0; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP1_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c new file mode 100644 index 000000000000..cea1a2f4b62f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define bpf_kfunc_multi_st_ops_test_1(args) bpf_kfunc_multi_st_ops_test_1(args, st_ops_id) +int st_ops_id; + +int test_pid; +int test_err; + +#define MAP2_MAGIC 4567 + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + return MAP2_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP2_MAGIC) + test_err++; + + return 0; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP2_MAGIC) + test_err++; + + return 0; +} + 
+SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c index 36386b3c23a1..2b98b7710816 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c @@ -9,7 +9,7 @@ void bpf_task_release(struct task_struct *p) __ksym; /* This test struct_ops BPF programs returning referenced kptr. The verifier should * allow a referenced kptr or a NULL pointer to be returned. A referenced kptr to task - * here is acquried automatically as the task argument is tagged with "__ref". + * here is acquired automatically as the task argument is tagged with "__ref". */ SEC("struct_ops/test_return_ref_kptr") struct task_struct *BPF_PROG(kptr_return, int dummy, diff --git a/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c b/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c index 76dcb6089d7f..9c0a65466356 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c @@ -9,7 +9,7 @@ __attribute__((nomerge)) extern void bpf_task_release(struct task_struct *p) __k /* This is a test BPF program that uses struct_ops to access a referenced * kptr argument. This is a test for the verifier to ensure that it - * 1) recongnizes the task as a referenced object (i.e., ref_obj_id > 0), and + * 1) recognizes the task as a referenced object (i.e., ref_obj_id > 0), and * 2) the same reference can be acquired from multiple paths as long as it * has not been released. 
*/ diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c index 327ca395e860..d556b19413d7 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c @@ -2,6 +2,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_legacy.h" +#include "bpf_test_utils.h" struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); @@ -24,6 +25,8 @@ int entry(struct __sk_buff *skb) { int ret = 1; + clobber_regs_stack(); + count++; subprog_tail(skb); subprog_tail(skb); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c index 72fd0d577506..ae94c9c70ab7 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -2,6 +2,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" +#include "bpf_test_utils.h" int classifier_0(struct __sk_buff *skb); int classifier_1(struct __sk_buff *skb); @@ -60,6 +61,8 @@ int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) { int ret = 0; + clobber_regs_stack(); + subprog_tail0(skb); subprog_tail1(skb); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c index a7fb91cb05b7..56b6b0099840 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -2,6 +2,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" +#include "bpf_test_utils.h" int classifier_0(struct __sk_buff *skb); @@ -53,6 +54,8 @@ int tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) { int ret = 0; + clobber_regs_stack(); + bpf_tail_call_static(skb, &jmp_table0, 0); __sink(ret); diff --git 
a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c index c87f9ca982d3..5261395713cd 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c @@ -4,6 +4,7 @@ #include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +#include "bpf_test_utils.h" struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); @@ -24,6 +25,8 @@ int subprog_tail(void *ctx) SEC("fentry/dummy") int BPF_PROG(fentry, struct sk_buff *skb) { + clobber_regs_stack(); + count++; subprog_tail(ctx); subprog_tail(ctx); diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h new file mode 100644 index 000000000000..432fff2af844 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TASK_LOCAL_DATA_BPF_H +#define __TASK_LOCAL_DATA_BPF_H + +/* + * Task local data is a library that facilitates sharing per-task data + * between user space and bpf programs. + * + * + * USAGE + * + * A TLD, an entry of data in task local data, first needs to be created by the + * user space. This is done by calling user space API, TLD_DEFINE_KEY() or + * tld_create_key(), with the name of the TLD and the size. + * + * TLD_DEFINE_KEY(prio, "priority", sizeof(int)); + * + * or + * + * void func_call(...) { + * tld_key_t prio, in_cs; + * + * prio = tld_create_key("priority", sizeof(int)); + * in_cs = tld_create_key("in_critical_section", sizeof(bool)); + * ... + * + * A key associated with the TLD, which has an opaque type tld_key_t, will be + * initialized or returned. It can be used to get a pointer to the TLD in the + * user space by calling tld_get_data(). 
+ * + * In a bpf program, tld_object_init() first needs to be called to initialize a + * tld_object on the stack. Then, TLDs can be accessed by calling tld_get_data(). + * The API will try to fetch the key by the name and use it to locate the data. + * A pointer to the TLD will be returned. It also caches the key in a task local + * storage map, tld_key_map, whose value type, struct tld_keys, must be defined + * by the developer. + * + * struct tld_keys { + * tld_key_t prio; + * tld_key_t in_cs; + * }; + * + * SEC("struct_ops") + * void prog(struct task_struct *task, ...) + * { + * struct tld_object tld_obj; + * int err, *p; + * + * err = tld_object_init(task, &tld_obj); + * if (err) + * return; + * + * p = tld_get_data(&tld_obj, prio, "priority", sizeof(int)); + * if (p) + * // do something depending on *p + */ +#include <errno.h> +#include <bpf/bpf_helpers.h> + +#define TLD_ROUND_MASK(x, y) ((__typeof__(x))((y) - 1)) +#define TLD_ROUND_UP(x, y) ((((x) - 1) | TLD_ROUND_MASK(x, y)) + 1) + +#define TLD_MAX_DATA_CNT (__PAGE_SIZE / sizeof(struct tld_metadata) - 1) + +#ifndef TLD_NAME_LEN +#define TLD_NAME_LEN 62 +#endif + +#ifndef TLD_KEY_MAP_CREATE_RETRY +#define TLD_KEY_MAP_CREATE_RETRY 10 +#endif + +typedef struct { + __s16 off; +} tld_key_t; + +struct tld_metadata { + char name[TLD_NAME_LEN]; + __u16 size; +}; + +struct tld_meta_u { + __u8 cnt; + __u16 size; + struct tld_metadata metadata[TLD_MAX_DATA_CNT]; +}; + +struct tld_data_u { + __u64 start; /* offset of tld_data_u->data in a page */ + char data[__PAGE_SIZE - sizeof(__u64)]; +}; + +struct tld_map_value { + struct tld_data_u __uptr *data; + struct tld_meta_u __uptr *meta; +}; + +typedef struct tld_uptr_dummy { + struct tld_data_u data[0]; + struct tld_meta_u meta[0]; +} *tld_uptr_dummy_t; + +struct tld_object { + struct tld_map_value *data_map; + struct tld_keys *key_map; + /* + * Force the compiler to generate the actual definition of tld_meta_u + * and tld_data_u in BTF. 
Without it, tld_meta_u and tld_data_u will + * be BTF_KIND_FWD. + */ + tld_uptr_dummy_t dummy[0]; +}; + +/* + * Map value of tld_key_map for caching keys. Must be defined by the developer. + * Members should be tld_key_t and passed as the 2nd argument of tld_get_data(). + */ +struct tld_keys; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tld_map_value); +} tld_data_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tld_keys); +} tld_key_map SEC(".maps"); + +/** + * tld_object_init() - Initialize a tld_object. + * + * @task: The task_struct of the target task + * @tld_obj: A pointer to a tld_object to be initialized + * + * Return 0 on success; -ENODATA if the user space did not initialize task local data + * for the current task through tld_get_data(); -ENOMEM if the creation of tld_key_map + * fails + */ +__attribute__((unused)) +static int tld_object_init(struct task_struct *task, struct tld_object *tld_obj) +{ + int i; + + tld_obj->data_map = bpf_task_storage_get(&tld_data_map, task, 0, 0); + if (!tld_obj->data_map) + return -ENODATA; + + bpf_for(i, 0, TLD_KEY_MAP_CREATE_RETRY) { + tld_obj->key_map = bpf_task_storage_get(&tld_key_map, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (likely(tld_obj->key_map)) + break; + } + if (!tld_obj->key_map) + return -ENOMEM; + + return 0; +} + +/* + * Return the offset of TLD if @name is found. Otherwise, return the current TLD count + * using the nonpositive range so that the next tld_get_data() can skip fetching key if + * no new TLD is added or start comparing name from the first newly added TLD. 
+ */ +__attribute__((unused)) +static int __tld_fetch_key(struct tld_object *tld_obj, const char *name, int i_start) +{ + struct tld_metadata *metadata; + int i, cnt, start, off = 0; + + if (!tld_obj->data_map || !tld_obj->data_map->data || !tld_obj->data_map->meta) + return 0; + + start = tld_obj->data_map->data->start; + cnt = tld_obj->data_map->meta->cnt; + metadata = tld_obj->data_map->meta->metadata; + + bpf_for(i, 0, cnt) { + if (i >= TLD_MAX_DATA_CNT) + break; + + if (i >= i_start && !bpf_strncmp(metadata[i].name, TLD_NAME_LEN, name)) + return start + off; + + off += TLD_ROUND_UP(metadata[i].size, 8); + } + + return -cnt; +} + +/** + * tld_get_data() - Retrieve a pointer to the TLD associated with the name. + * + * @tld_obj: A pointer to a valid tld_object initialized by tld_object_init() + * @key: The cached key of the TLD in tld_key_map + * @name: The name of the key associated with a TLD + * @size: The size of the TLD. Must be a known constant value + * + * Return a pointer to the TLD associated with @name; NULL if not found or @size is too + * big. @key is used to cache the key if the TLD is found to speed up subsequent calls. + * It should be defined as a member of tld_keys of tld_key_t type by the developer. 
+ */ +#define tld_get_data(tld_obj, key, name, size) \ + ({ \ + void *data = NULL, *_data = (tld_obj)->data_map->data; \ + long off = (tld_obj)->key_map->key.off; \ + int cnt; \ + \ + if (likely(_data)) { \ + if (likely(off > 0)) { \ + barrier_var(off); \ + if (likely(off < __PAGE_SIZE - size)) \ + data = _data + off; \ + } else { \ + cnt = -(off); \ + if (likely((tld_obj)->data_map->meta) && \ + cnt < (tld_obj)->data_map->meta->cnt) { \ + off = __tld_fetch_key(tld_obj, name, cnt); \ + (tld_obj)->key_map->key.off = off; \ + \ + if (likely(off < __PAGE_SIZE - size)) { \ + barrier_var(off); \ + if (off > 0) \ + data = _data + off; \ + } \ + } \ + } \ + } \ + data; \ + }) + +#endif diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c new file mode 100644 index 000000000000..23217f06a3ec --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <string.h> +#include <stdbool.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "errno.h" + +char _license[] SEC("license") = "GPL"; + +const void *user_ptr = NULL; + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} arrmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} lrumap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + struct elem *work = value; + + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); + return 0; +} + +int key = 0; + +SEC("perf_event") +int oncpu_hash_map(struct pt_regs *args) +{ + struct elem empty_work = { .data = { 0 } }; + struct elem *work; + struct task_struct *task; + int err; + + task = bpf_get_current_task_btf(); + err = bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); + if (err) + return 0; + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) + return 0; + + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +int oncpu_array_map(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +int oncpu_lru_map(struct pt_regs *args) +{ + struct elem empty_work = { .data = { 0 } }; + struct elem *work; + struct task_struct *task; + int err; + + task = 
bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&lrumap, &key); + if (work) + return 0; + err = bpf_map_update_elem(&lrumap, &key, &empty_work, BPF_NOEXIST); + if (err) + return 0; + work = bpf_map_lookup_elem(&lrumap, &key); + if (!work || work->data[0]) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c new file mode 100644 index 000000000000..77fe8f28facd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <string.h> +#include <stdbool.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +const void *user_ptr = NULL; + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} arrmap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + struct elem *work = value; + + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); + return 0; +} + +int key = 0; + +SEC("perf_event") +__failure __msg("doesn't match map pointer in R3") +int mismatch_map(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("arg#1 doesn't 
point to a map value") +int no_map_task_work(struct pt_regs *args) +{ + struct task_struct *task; + struct bpf_task_work tw; + + task = bpf_get_current_task_btf(); + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int task_work_null(struct pt_regs *args) +{ + struct task_struct *task; + + task = bpf_get_current_task_btf(); + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("Possibly NULL pointer passed to trusted arg2") +int map_null(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c new file mode 100644 index 000000000000..90fca06fff56 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <string.h> +#include <stdbool.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +#define ENTRIES 128 + +char _license[] SEC("license") = "GPL"; + +__u64 callback_scheduled = 0; +__u64 callback_success = 0; +__u64 schedule_error = 0; +__u64 delete_success = 0; + +struct elem { + __u32 count; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, ENTRIES); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + __sync_fetch_and_add(&callback_success, 1); + return 0; +} + +SEC("syscall") +int schedule_task_work(void *ctx) +{ + struct elem empty_work = {.count = 0}; + struct elem *work; + int key = 0, err; + + key = bpf_ktime_get_ns() % ENTRIES; + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) { + bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) + return 0; + } + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work, NULL); + if (err) + __sync_fetch_and_add(&schedule_error, 1); + else + __sync_fetch_and_add(&callback_scheduled, 1); + return 0; +} + +SEC("syscall") +int delete_task_work(void *ctx) +{ + int key = 0, err; + + key = bpf_get_prandom_u32() % ENTRIES; + err = bpf_map_delete_elem(&hmap, &key); + if (!err) + __sync_fetch_and_add(&delete_success, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index f344c6835e84..26a53e54b8fa 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -22,6 +22,7 @@ #include "bpf_compiler.h" #include "test_cls_redirect.h" +#include "bpf_misc.h" #pragma GCC diagnostic ignored "-Waddress-of-packed-member" 
@@ -31,9 +32,6 @@ #define INLINING __always_inline #endif -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) - #define IP_OFFSET_MASK (0x1FFF) #define IP_MF (0x2000) @@ -129,7 +127,7 @@ typedef uint8_t *net_ptr __attribute__((align_value(8))); typedef struct buf { struct __sk_buff *skb; net_ptr head; - /* NB: tail musn't have alignment other than 1, otherwise + /* NB: tail mustn't have alignment other than 1, otherwise * LLVM will go and eliminate code, e.g. when checking packet lengths. */ uint8_t *const tail; diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c index d0f7670351e5..dfd4a2710391 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c @@ -494,7 +494,7 @@ static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_header *offset += sizeof(*next_hop); - /* Skip the remainig next hops (may be zero). */ + /* Skip the remaining next hops (may be zero). 
*/ return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1); } diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index abb7344b531f..5edf3cdc213d 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ -#include <stdbool.h> -#include <stddef.h> -#include <linux/bpf.h> -#include <linux/ptrace.h> +#include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> diff --git a/tools/testing/selftests/bpf/progs/test_pinning_devmap.c b/tools/testing/selftests/bpf/progs/test_pinning_devmap.c new file mode 100644 index 000000000000..c855f8f87eff --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_pinning_devmap.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} pinmap1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u32); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} pinmap2 SEC(".maps"); diff --git a/tools/testing/selftests/bpf/progs/test_task_local_data.c b/tools/testing/selftests/bpf/progs/test_task_local_data.c new file mode 100644 index 000000000000..fffafc013044 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_task_local_data.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> + +#include "task_local_data.bpf.h" + +struct tld_keys { + tld_key_t value0; + tld_key_t value1; + tld_key_t value2; + tld_key_t value_not_exist; +}; + +struct test_tld_struct { + __u64 a; + __u64 b; + __u64 c; + __u64 d; +}; + +int test_value0; +int 
test_value1; +struct test_tld_struct test_value2; + +SEC("syscall") +int task_main(void *ctx) +{ + struct tld_object tld_obj; + struct test_tld_struct *struct_p; + struct task_struct *task; + int err, *int_p; + + task = bpf_get_current_task_btf(); + err = tld_object_init(task, &tld_obj); + if (err) + return 1; + + int_p = tld_get_data(&tld_obj, value0, "value0", sizeof(int)); + if (int_p) + test_value0 = *int_p; + else + return 2; + + int_p = tld_get_data(&tld_obj, value1, "value1", sizeof(int)); + if (int_p) + test_value1 = *int_p; + else + return 3; + + struct_p = tld_get_data(&tld_obj, value2, "value2", sizeof(struct test_tld_struct)); + if (struct_p) + test_value2 = *struct_p; + else + return 4; + + int_p = tld_get_data(&tld_obj, value_not_exist, "value_not_exist", sizeof(int)); + if (int_p) + return 5; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c index 5f4e87ee949a..1ecdf4c54de4 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c @@ -14,10 +14,7 @@ #include <bpf/bpf_endian.h> #define BPF_PROG_TEST_TCP_HDR_OPTIONS #include "test_tcp_hdr_options.h" - -#ifndef sizeof_field -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) -#endif +#include "bpf_misc.h" __u8 test_kind = TCPOPT_EXP; __u16 test_magic = 0xeB9F; diff --git a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c index 540181c115a8..ef00d38b0a8d 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c @@ -23,7 +23,6 @@ struct { struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(max_entries, 2); __type(key, int); __type(value, __u32); } perf_event_map SEC(".maps"); diff --git a/tools/testing/selftests/bpf/progs/test_uprobe.c 
b/tools/testing/selftests/bpf/progs/test_uprobe.c index 896c88a4960d..12f4065fca20 100644 --- a/tools/testing/selftests/bpf/progs/test_uprobe.c +++ b/tools/testing/selftests/bpf/progs/test_uprobe.c @@ -59,3 +59,41 @@ int BPF_UPROBE(test4) test4_result = 1; return 0; } + +#if defined(__TARGET_ARCH_x86) +struct pt_regs regs; + +SEC("uprobe") +int BPF_UPROBE(test_regs_change) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + ctx->ax = regs.ax; + ctx->cx = regs.cx; + ctx->dx = regs.dx; + ctx->r8 = regs.r8; + ctx->r9 = regs.r9; + ctx->r10 = regs.r10; + ctx->r11 = regs.r11; + ctx->di = regs.di; + ctx->si = regs.si; + return 0; +} + +unsigned long ip; + +SEC("uprobe") +int BPF_UPROBE(test_regs_change_ip) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + ctx->ip = ip; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/test_usdt.c b/tools/testing/selftests/bpf/progs/test_usdt.c index 096488f47fbc..a78c87537b07 100644 --- a/tools/testing/selftests/bpf/progs/test_usdt.c +++ b/tools/testing/selftests/bpf/progs/test_usdt.c @@ -107,4 +107,35 @@ int BPF_USDT(usdt12, int a1, int a2, long a3, long a4, unsigned a5, return 0; } +int usdt_sib_called; +u64 usdt_sib_cookie; +int usdt_sib_arg_cnt; +int usdt_sib_arg_ret; +short usdt_sib_arg; +int usdt_sib_arg_size; + +/* + * usdt_sib is only tested on x86-related architectures, so it requires + * manual attach since auto-attach will panic tests under other architectures + */ +SEC("usdt") +int usdt_sib(struct pt_regs *ctx) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt_sib_called, 1); + + usdt_sib_cookie = bpf_usdt_cookie(ctx); + usdt_sib_arg_cnt = bpf_usdt_arg_cnt(ctx); + + usdt_sib_arg_ret = bpf_usdt_arg(ctx, 0, &tmp); + usdt_sib_arg = (short)tmp; + usdt_sib_arg_size = bpf_usdt_arg_size(ctx, 0); + + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git 
a/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c new file mode 100644 index 000000000000..814e2a980e97 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +SEC("xdp") +int xdp_devmap(struct xdp_md *ctx) +{ + return ctx->egress_ifindex; +} + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, int (void *)); +} xdp_map SEC(".maps") = { + .values = { + [0] = (void *)&xdp_devmap, + }, +}; + +SEC("xdp") +int xdp_entry(struct xdp_md *ctx) +{ + bpf_tail_call(ctx, &xdp_map, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index fcf6ca14f2ea..d79cb74b571e 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -1,8 +1,11 @@ +#include <stdbool.h> #include <linux/bpf.h> +#include <linux/errno.h> #include <linux/if_ether.h> #include <linux/pkt_cls.h> #include <bpf/bpf_helpers.h> +#include "bpf_kfuncs.h" #define META_SIZE 32 @@ -23,6 +26,8 @@ struct { __uint(value_size, META_SIZE); } test_result SEC(".maps"); +bool test_pass; + SEC("tc") int ing_cls(struct __sk_buff *ctx) { @@ -40,6 +45,231 @@ int ing_cls(struct __sk_buff *ctx) return TC_ACT_SHOT; } +/* Read from metadata using bpf_dynptr_read helper */ +SEC("tc") +int ing_cls_dynptr_read(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 zero = 0; + __u8 *dst; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_read(dst, META_SIZE, &meta, 0, 0); + + return TC_ACT_SHOT; +} + +/* Write to metadata using bpf_dynptr_write helper */ +SEC("tc") +int 
ing_cls_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + __u8 *src; + + bpf_dynptr_from_skb(ctx, 0, &data); + src = bpf_dynptr_slice(&data, sizeof(struct ethhdr), NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_write(&meta, 0, src, META_SIZE, 0); + + return TC_ACT_UNSPEC; /* pass */ +} + +/* Read from metadata using read-only dynptr slice */ +SEC("tc") +int ing_cls_dynptr_slice(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 zero = 0; + __u8 *dst, *src; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + src = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + __builtin_memcpy(dst, src, META_SIZE); + + return TC_ACT_SHOT; +} + +/* Write to metadata using writeable dynptr slice */ +SEC("tc") +int ing_cls_dynptr_slice_rdwr(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + __u8 *src, *dst; + + bpf_dynptr_from_skb(ctx, 0, &data); + src = bpf_dynptr_slice(&data, sizeof(struct ethhdr), NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + dst = bpf_dynptr_slice_rdwr(&meta, 0, NULL, META_SIZE); + if (!dst) + return TC_ACT_SHOT; + + __builtin_memcpy(dst, src, META_SIZE); + + return TC_ACT_UNSPEC; /* pass */ +} + +/* Read skb metadata in chunks from various offsets in different ways. */ +SEC("tc") +int ing_cls_dynptr_offset_rd(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 chunk_len = META_SIZE / 4; + const __u32 zero = 0; + __u8 *dst, *src; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + /* 1. Regular read */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_read(dst, chunk_len, &meta, 0, 0); + dst += chunk_len; + + /* 2. 
Read from an offset-adjusted dynptr */ + bpf_dynptr_adjust(&meta, chunk_len, bpf_dynptr_size(&meta)); + bpf_dynptr_read(dst, chunk_len, &meta, 0, 0); + dst += chunk_len; + + /* 3. Read at an offset */ + bpf_dynptr_read(dst, chunk_len, &meta, chunk_len, 0); + dst += chunk_len; + + /* 4. Read from a slice starting at an offset */ + src = bpf_dynptr_slice(&meta, 2 * chunk_len, NULL, chunk_len); + if (!src) + return TC_ACT_SHOT; + __builtin_memcpy(dst, src, chunk_len); + + return TC_ACT_SHOT; +} + +/* Write skb metadata in chunks at various offsets in different ways. */ +SEC("tc") +int ing_cls_dynptr_offset_wr(struct __sk_buff *ctx) +{ + const __u32 chunk_len = META_SIZE / 4; + __u8 payload[META_SIZE]; + struct bpf_dynptr meta; + __u8 *dst, *src; + + bpf_skb_load_bytes(ctx, sizeof(struct ethhdr), payload, sizeof(payload)); + src = payload; + + /* 1. Regular write */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_write(&meta, 0, src, chunk_len, 0); + src += chunk_len; + + /* 2. Write to an offset-adjusted dynptr */ + bpf_dynptr_adjust(&meta, chunk_len, bpf_dynptr_size(&meta)); + bpf_dynptr_write(&meta, 0, src, chunk_len, 0); + src += chunk_len; + + /* 3. Write at an offset */ + bpf_dynptr_write(&meta, chunk_len, src, chunk_len, 0); + src += chunk_len; + + /* 4. Write to a slice starting at an offset */ + dst = bpf_dynptr_slice_rdwr(&meta, 2 * chunk_len, NULL, chunk_len); + if (!dst) + return TC_ACT_SHOT; + __builtin_memcpy(dst, src, chunk_len); + + return TC_ACT_UNSPEC; /* pass */ +} + +/* Pass an OOB offset to dynptr read, write, adjust, slice. 
*/ +SEC("tc") +int ing_cls_dynptr_offset_oob(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + __u8 md, *p; + int err; + + err = bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (err) + goto fail; + + /* read offset OOB */ + err = bpf_dynptr_read(&md, sizeof(md), &meta, META_SIZE, 0); + if (err != -E2BIG) + goto fail; + + /* write offset OOB */ + err = bpf_dynptr_write(&meta, META_SIZE, &md, sizeof(md), 0); + if (err != -E2BIG) + goto fail; + + /* adjust end offset OOB */ + err = bpf_dynptr_adjust(&meta, 0, META_SIZE + 1); + if (err != -ERANGE) + goto fail; + + /* adjust start offset OOB */ + err = bpf_dynptr_adjust(&meta, META_SIZE + 1, META_SIZE + 1); + if (err != -ERANGE) + goto fail; + + /* slice offset OOB */ + p = bpf_dynptr_slice(&meta, META_SIZE, NULL, sizeof(*p)); + if (p) + goto fail; + + /* slice rdwr offset OOB */ + p = bpf_dynptr_slice_rdwr(&meta, META_SIZE, NULL, sizeof(*p)); + if (p) + goto fail; + + return TC_ACT_UNSPEC; +fail: + return TC_ACT_SHOT; +} + +/* Reserve and clear space for metadata but don't populate it */ +SEC("xdp") +int ing_xdp_zalloc_meta(struct xdp_md *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + __u8 *meta; + int ret; + + /* Drop any non-test packets */ + if (eth + 1 > ctx_ptr(ctx, data_end)) + return XDP_DROP; + if (eth->h_proto != 0) + return XDP_DROP; + + ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); + if (ret < 0) + return XDP_DROP; + + meta = ctx_ptr(ctx, data_meta); + if (meta + META_SIZE > ctx_ptr(ctx, data)) + return XDP_DROP; + + __builtin_memset(meta, 0, META_SIZE); + + return XDP_PASS; +} + SEC("xdp") int ing_xdp(struct xdp_md *ctx) { @@ -73,4 +303,193 @@ int ing_xdp(struct xdp_md *ctx) return XDP_PASS; } +/* + * Check that skb->data_meta..skb->data is empty if prog writes to packet + * _payload_ using packet pointers. Applies only to cloned skbs. 
+ */ +SEC("tc") +int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + + if (eth + 1 > ctx_ptr(ctx, data_end)) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + if (ctx->data_meta != ctx->data) + goto out; + + /* Packet write to trigger unclone in prologue */ + eth->h_proto = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb->data_meta..skb->data is empty if prog writes to packet + * _metadata_ using packet pointers. Applies only to cloned skbs. + */ +SEC("tc") +int clone_data_meta_empty_on_meta_write(struct __sk_buff *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + __u8 *md = ctx_ptr(ctx, data_meta); + + if (eth + 1 > ctx_ptr(ctx, data_end)) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + if (md + 1 > ctx_ptr(ctx, data)) { + /* Expect no metadata */ + test_pass = true; + } else { + /* Metadata write to trigger unclone in prologue */ + *md = 42; + } +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is writable but empty if prog writes to packet + * _payload_ using a dynptr slice. Applies only to cloned skbs. + */ +SEC("tc") +int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice_rdwr(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) + goto out; + + /* Packet write to trigger unclone in prologue */ + eth->h_proto = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is writable but empty if prog writes to packet + * _metadata_ using a dynptr slice. Applies only to cloned skbs. 
+ */ +SEC("tc") +int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + __u8 *md; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) + goto out; + + /* Metadata write to trigger unclone in prologue */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (md) + *md = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is read-only before prog writes to packet payload + * using dynptr_write helper. Applies only to cloned skbs. + */ +SEC("tc") +int clone_dynptr_rdonly_before_data_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect read-only metadata before unclone */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) + goto out; + + /* Helper write to payload will unclone the packet */ + bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0); + + /* Expect no metadata after unclone */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != 0) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is read-only if prog writes to packet + * metadata using dynptr_write helper. Applies only to cloned skbs. 
+ */ +SEC("tc") +int clone_dynptr_rdonly_before_meta_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect read-only metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) + goto out; + + /* Metadata write. Expect failure. */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_write(&meta, 0, "x", 1, 0) != -EINVAL) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c b/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c new file mode 100644 index 000000000000..c41a21413eaa --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +int xdpf_sz; +int sinfo_sz; +int data_len; +int pull_len; + +#define XDP_PACKET_HEADROOM 256 + +SEC("xdp.frags") +int xdp_find_sizes(struct xdp_md *ctx) +{ + xdpf_sz = sizeof(struct xdp_frame); + sinfo_sz = __PAGE_SIZE - XDP_PACKET_HEADROOM - + (ctx->data_end - ctx->data); + + return XDP_PASS; +} + +SEC("xdp.frags") +int xdp_pull_data_prog(struct xdp_md *ctx) +{ + __u8 *data_end = (void *)(long)ctx->data_end; + __u8 *data = (void *)(long)ctx->data; + __u8 *val_p; + int err; + + if (data_len != data_end - data) + return XDP_DROP; + + err = bpf_xdp_pull_data(ctx, pull_len); + if (err) + return XDP_DROP; + + val_p = (void *)(long)ctx->data + 1024; + if (val_p + 1 > (void *)(long)ctx->data_end) + return XDP_DROP; + + if (*val_p != 0xbb) + return XDP_DROP; + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git 
a/tools/testing/selftests/bpf/progs/timer_interrupt.c b/tools/testing/selftests/bpf/progs/timer_interrupt.c new file mode 100644 index 000000000000..19180a455f40 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_interrupt.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +#define CLOCK_MONOTONIC 1 + +int preempt_count; +int in_interrupt; +int in_interrupt_cb; + +struct elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +static int timer_in_interrupt(void *map, int *key, struct bpf_timer *timer) +{ + preempt_count = get_preempt_count(); + in_interrupt_cb = bpf_in_interrupt(); + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test_timer_interrupt) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&array, &key); + if (!timer) + return 0; + + in_interrupt = bpf_in_interrupt(); + bpf_timer_init(timer, &array, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, timer_in_interrupt); + bpf_timer_start(timer, 0, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_struct.c b/tools/testing/selftests/bpf/progs/tracing_struct.c index c435a3a8328a..d460732e2023 100644 --- a/tools/testing/selftests/bpf/progs/tracing_struct.c +++ b/tools/testing/selftests/bpf/progs/tracing_struct.c @@ -18,6 +18,18 @@ struct bpf_testmod_struct_arg_3 { int b[]; }; +union bpf_testmod_union_arg_1 { + char a; + short b; + struct bpf_testmod_struct_arg_1 arg; +}; + +union bpf_testmod_union_arg_2 { + int a; + long b; + struct bpf_testmod_struct_arg_2 arg; +}; + long t1_a_a, t1_a_b, t1_b, t1_c, t1_ret, t1_nregs; __u64 t1_reg0, t1_reg1, t1_reg2, t1_reg3; long t2_a, t2_b_a, t2_b_b, t2_c, t2_ret; @@ -26,6 +38,9 @@ long t4_a_a, t4_b, t4_c, t4_d, 
t4_e_a, t4_e_b, t4_ret; long t5_ret; int t6; +long ut1_a_a, ut1_b, ut1_c; +long ut2_a, ut2_b_a, ut2_b_b; + SEC("fentry/bpf_testmod_test_struct_arg_1") int BPF_PROG2(test_struct_arg_1, struct bpf_testmod_struct_arg_2, a, int, b, int, c) { @@ -130,4 +145,22 @@ int BPF_PROG2(test_struct_arg_11, struct bpf_testmod_struct_arg_3 *, a) return 0; } +SEC("fexit/bpf_testmod_test_union_arg_1") +int BPF_PROG2(test_union_arg_1, union bpf_testmod_union_arg_1, a, int, b, int, c) +{ + ut1_a_a = a.arg.a; + ut1_b = b; + ut1_c = c; + return 0; +} + +SEC("fexit/bpf_testmod_test_union_arg_2") +int BPF_PROG2(test_union_arg_2, int, a, union bpf_testmod_union_arg_2, b) +{ + ut2_a = a; + ut2_b_a = b.arg.a; + ut2_b_b = b.arg.b; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 044a6d78923e..3d5f30c29ae3 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -97,6 +97,12 @@ int bench_trigger_kprobe_multi(void *ctx) return 0; } +SEC("?kprobe.multi/bpf_get_numa_node_id") +int bench_kprobe_multi_empty(void *ctx) +{ + return 0; +} + SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { @@ -104,6 +110,12 @@ int bench_trigger_kretprobe_multi(void *ctx) return 0; } +SEC("?kretprobe.multi/bpf_get_numa_node_id") +int bench_kretprobe_multi_empty(void *ctx) +{ + return 0; +} + SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c index 8a4fa6c7ef59..e08c31669e5a 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c @@ -7,8 +7,8 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; -SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") -int uretprobe(struct 
pt_regs *ctx) +SEC("uprobe") +int probe(struct pt_regs *ctx) { __builtin_memcpy(&regs, ctx, sizeof(regs)); return 0; diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2..915d38591bf6 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/usdt.bpf.h> #include <string.h> struct pt_regs regs; @@ -8,10 +10,64 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; int executed = 0; +int pid; + +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.session") +int test_uprobe_session(struct pt_regs *ctx) { - executed = 1; + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; } diff --git a/tools/testing/selftests/bpf/progs/uretprobe_stack.c b/tools/testing/selftests/bpf/progs/uretprobe_stack.c index 9fdcf396b8f4..a2951e2f1711 100644 --- a/tools/testing/selftests/bpf/progs/uretprobe_stack.c +++ b/tools/testing/selftests/bpf/progs/uretprobe_stack.c @@ -26,8 +26,8 @@ int usdt_len; 
SEC("uprobe//proc/self/exe:target_1") int BPF_UPROBE(uprobe_1) { - /* target_1 is recursive wit depth of 2, so we capture two separate - * stack traces, depending on which occurence it is + /* target_1 is recursive with depth of 2, so we capture two separate + * stack traces, depending on which occurrence it is */ static bool recur = false; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 9dbdf123542d..f19e15400b3e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -240,6 +240,7 @@ int big_alloc2(void *ctx) return 5; bpf_arena_free_pages(&arena, (void __arena *)pg, 2); page[i] = NULL; + barrier(); page[i + 1] = NULL; cond_break; } diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 87a2c60d86e6..0a72e0228ea9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -926,7 +926,7 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for non const xor src dst") __success __log_level(2) -__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (af) r0 ^= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_xor_src_dst(void) { asm volatile (" \ @@ -947,7 +947,7 @@ __naked void non_const_xor_src_dst(void) SEC("socket") __description("bounds check for non const or src dst") __success __log_level(2) -__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (4f) r0 |= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_or_src_dst(void) { asm volatile (" \ @@ -968,7 +968,7 @@ __naked void non_const_or_src_dst(void) SEC("socket") 
__description("bounds check for non const mul regs") __success __log_level(2) -__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__msg("5: (2f) r0 *= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") __naked void non_const_mul_regs(void) { asm volatile (" \ @@ -1241,7 +1241,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("multiply mixed sign bounds. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") __naked void mult_mixed0_sign(void) { asm volatile ( @@ -1264,7 +1264,7 @@ __naked void mult_mixed0_sign(void) SEC("tc") __description("multiply mixed sign bounds. test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=smin32=-100,smax=smax32=200)") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=smin32=-100,smax=smax32=200)") __naked void mult_mixed1_sign(void) { asm volatile ( @@ -1287,7 +1287,7 @@ __naked void mult_mixed1_sign(void) SEC("tc") __description("multiply negative bounds") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") __naked void mult_sign_bounds(void) { asm volatile ( @@ -1311,7 +1311,7 @@ __naked void mult_sign_bounds(void) SEC("tc") __description("multiply bounds that don't cross signed boundary") __success __log_level(2) -__msg("r8 *= r6 {{.*}}; R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) 
R8_w=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") +__msg("r8 *= r6 {{.*}}; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") __naked void mult_no_sign_crossing(void) { asm volatile ( @@ -1331,7 +1331,7 @@ __naked void mult_no_sign_crossing(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_unsign_ovf(void) { asm volatile ( @@ -1353,7 +1353,7 @@ __naked void mult_unsign_ovf(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_sign_ovf(void) { asm volatile ( @@ -1376,7 +1376,7 @@ __naked void mult_sign_ovf(void) SEC("socket") __description("64-bit addition, all outcomes overflow") __success __log_level(2) -__msg("5: (0f) r3 += r3 {{.*}} R3_w=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") +__msg("5: (0f) r3 += r3 {{.*}} R3=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") __retval(0) __naked void add64_full_overflow(void) { @@ -1396,7 +1396,7 @@ __naked void add64_full_overflow(void) SEC("socket") __description("64-bit addition, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("4: (0f) r3 += r3 {{.*}} R3_w=scalar()") +__msg("4: (0f) r3 += r3 {{.*}} R3=scalar()") __retval(0) __naked void add64_partial_overflow(void) { @@ -1416,7 +1416,7 @@ __naked void add64_partial_overflow(void) SEC("socket") __description("32-bit addition overflow, all outcomes overflow") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} 
R3=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_full_overflow(void) { @@ -1436,7 +1436,7 @@ __naked void add32_full_overflow(void) SEC("socket") __description("32-bit addition, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_partial_overflow(void) { @@ -1456,7 +1456,7 @@ __naked void add32_partial_overflow(void) SEC("socket") __description("64-bit subtraction, all outcomes underflow") __success __log_level(2) -__msg("6: (1f) r3 -= r1 {{.*}} R3_w=scalar(umin=1,umax=0x8000000000000000)") +__msg("6: (1f) r3 -= r1 {{.*}} R3=scalar(umin=1,umax=0x8000000000000000)") __retval(0) __naked void sub64_full_overflow(void) { @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3_w=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") __retval(0) __naked void sub64_partial_overflow(void) { @@ -1496,7 +1496,7 @@ __naked void sub64_partial_overflow(void) SEC("socket") __description("32-bit subtraction overflow, all outcomes underflow") __success __log_level(2) -__msg("5: (1c) w3 -= w1 {{.*}} R3_w=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") +__msg("5: (1c) w3 -= w1 {{.*}} R3=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_full_overflow(void) { @@ -1517,7 +1517,7 @@ __naked void sub32_full_overflow(void) SEC("socket") __description("32-bit subtraction, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("3: (1c) w3 -= w2 {{.*}} 
R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("3: (1c) w3 -= w2 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_partial_overflow(void) { @@ -1617,7 +1617,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, positive overlap") __success __log_level(2) __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") __retval(0) __naked void bounds_deduct_positive_overlap(void) { @@ -1650,7 +1650,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, two overlaps") __failure __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") __msg("frame pointer is read only") __naked void bounds_deduct_two_overlaps(void) { @@ -1668,4 +1668,45 @@ l0_%=: r0 = 0; \ : __clobber_all); } +SEC("socket") +__description("dead jne branch due to disagreeing tnums") +__success __log_level(2) +__naked void jne_disagreeing_tnums(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + w0 = w0; \ + r0 >>= 30; \ + r0 <<= 30; \ + r1 = r0; \ + r1 += 1024; \ + if r1 != r0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("dead jeq branch due to disagreeing tnums") +__success __log_level(2) +__naked void jeq_disagreeing_tnums(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + w0 = w0; \ + r0 >>= 30; \ + r0 <<= 30; \ + r1 = r0; \ + r1 += 1024; \ + if r1 == r0 goto +1; \ + exit; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char 
_license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index c258b0722e04..fb4fa465d67c 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -660,19 +660,24 @@ __naked void may_goto_interaction_x86_64(void) SEC("raw_tp") __arch_arm64 -__log_level(4) __msg("stack depth 16") -/* may_goto counter at -16 */ -__xlated("0: *(u64 *)(r10 -16) =") -__xlated("1: r1 = 1") -__xlated("2: call bpf_get_smp_processor_id") +__log_level(4) __msg("stack depth 24") +/* may_goto counter at -24 */ +__xlated("0: *(u64 *)(r10 -24) =") +/* may_goto timestamp at -16 */ +__xlated("1: *(u64 *)(r10 -16) =") +__xlated("2: r1 = 1") +__xlated("3: call bpf_get_smp_processor_id") /* may_goto expansion starts */ -__xlated("3: r11 = *(u64 *)(r10 -16)") -__xlated("4: if r11 == 0x0 goto pc+3") -__xlated("5: r11 -= 1") -__xlated("6: *(u64 *)(r10 -16) = r11") +__xlated("4: r11 = *(u64 *)(r10 -24)") +__xlated("5: if r11 == 0x0 goto pc+6") +__xlated("6: r11 -= 1") +__xlated("7: if r11 != 0x0 goto pc+2") +__xlated("8: r11 = -24") +__xlated("9: call unknown") +__xlated("10: *(u64 *)(r10 -24) = r11") /* may_goto expansion ends */ -__xlated("7: *(u64 *)(r10 -8) = r1") -__xlated("8: exit") +__xlated("11: *(u64 *)(r10 -8) = r1") +__xlated("12: exit") __success __naked void may_goto_interaction_arm64(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_ctx.c b/tools/testing/selftests/bpf/progs/verifier_ctx.c index 424463094760..5ebf7d9bcc55 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ctx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ctx.c @@ -5,8 +5,6 @@ #include <bpf/bpf_helpers.h> #include "bpf_misc.h" -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) - SEC("tc") __description("context stores via BPF_ATOMIC") __failure __msg("BPF_ATOMIC stores into R1 ctx is not allowed") @@ 
-264,4 +262,34 @@ narrow_load("sockops", bpf_sock_ops, skb_hwtstamp); unaligned_access("flow_dissector", __sk_buff, data); unaligned_access("netfilter", bpf_nf_ctx, skb); +#define padding_access(type, ctx, prev_field, sz) \ + SEC(type) \ + __description("access on " #ctx " padding after " #prev_field) \ + __naked void padding_ctx_access_##ctx(void) \ + { \ + asm volatile (" \ + r1 = *(u%[size] *)(r1 + %[off]); \ + r0 = 0; \ + exit;" \ + : \ + : __imm_const(size, sz * 8), \ + __imm_const(off, offsetofend(struct ctx, prev_field)) \ + : __clobber_all); \ + } + +__failure __msg("invalid bpf_context access") +padding_access("cgroup/bind4", bpf_sock_addr, msg_src_ip6[3], 4); + +__success +padding_access("sk_lookup", bpf_sk_lookup, remote_port, 2); + +__failure __msg("invalid bpf_context access") +padding_access("tc", __sk_buff, tstamp_type, 2); + +__failure __msg("invalid bpf_context access") +padding_access("cgroup/post_bind4", bpf_sock, dst_port, 2); + +__failure __msg("invalid bpf_context access") +padding_access("sk_reuseport", sk_reuseport_md, hash, 4); + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 181da86ba5f0..6630a92b1b47 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -215,7 +215,7 @@ __weak int subprog_untrusted(const volatile struct task_struct *restrict task __ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_untrusted') is global and assumed valid.") __msg("Validating subprog_untrusted() func#1...") __msg(": R1=untrusted_ptr_task_struct") @@ -278,7 +278,7 @@ __weak int subprog_enum_untrusted(enum bpf_attach_type *p __arg_untrusted) SEC("tp_btf/sys_enter") __success __log_level(2) 
-__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_void_untrusted') is global and assumed valid.") __msg("Validating subprog_void_untrusted() func#1...") __msg(": R1=rdonly_untrusted_mem(sz=0)") diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index 52edee41caf6..c8494b682c31 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -3,6 +3,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" +#include "bpf_arena_common.h" #if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ @@ -10,6 +11,12 @@ defined(__TARGET_ARCH_loongarch)) && \ __clang_major__ >= 18 +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); +} arena SEC(".maps"); + SEC("socket") __description("LDSX, S8") __success __success_unpriv __retval(-2) @@ -65,7 +72,7 @@ __naked void ldsx_s32(void) SEC("socket") __description("LDSX, S8 range checking, privileged") __log_level(2) __success __retval(1) -__msg("R1_w=scalar(smin=smin32=-128,smax=smax32=127)") +__msg("R1=scalar(smin=smin32=-128,smax=smax32=127)") __naked void ldsx_s8_range_priv(void) { asm volatile ( @@ -256,6 +263,175 @@ __naked void ldsx_ctx_8(void) : __clobber_all); } +SEC("syscall") +__description("Arena LDSX Disasm") +__success +__arch_x86_64 +__jited("movslq 0x10(%rax,%r12), %r14") +__jited("movswq 0x18(%rax,%r12), %r14") +__jited("movsbq 0x20(%rax,%r12), %r14") +__jited("movslq 0x10(%rdi,%r12), %r15") +__jited("movswq 0x18(%rdi,%r12), %r15") +__jited("movsbq 0x20(%rdi,%r12), %r15") +__arch_arm64 +__jited("add x11, x7, x28") +__jited("ldrsw x21, [x11, #0x10]") +__jited("add x11, x7, x28") +__jited("ldrsh x21, [x11, #0x18]") +__jited("add x11, x7, x28") +__jited("ldrsb x21, 
[x11, #0x20]") +__jited("add x11, x0, x28") +__jited("ldrsw x22, [x11, #0x10]") +__jited("add x11, x0, x28") +__jited("ldrsh x22, [x11, #0x18]") +__jited("add x11, x0, x28") +__jited("ldrsb x22, [x11, #0x20]") +__naked void arena_ldsx_disasm(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = r0;" + "r8 = *(s32 *)(r0 + 16);" + "r8 = *(s16 *)(r0 + 24);" + "r8 = *(s8 *)(r0 + 32);" + "r9 = *(s32 *)(r1 + 16);" + "r9 = *(s16 *)(r1 + 24);" + "r9 = *(s8 *)(r1 + 32);" + "r0 = 0;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX Exception") +__success __retval(0) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_exception(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r0 = 0xdeadbeef;" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fe;" + "*(u64 *)(r0 + 0) = r1;" + "r0 = *(s8 *)(r0 + 0);" + "exit;" + : + : __imm_addr(arena) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S8") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s8(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s8 *)(r0 + 0);" +#else + "r0 = *(s8 *)(r0 + 7);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S16") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s16(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = 
%[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fffe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s16 *)(r0 + 0);" +#else + "r0 = *(s16 *)(r0 + 6);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S32") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s32(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0xfffffffe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s32 *)(r0 + 0);" +#else + "r0 = *(s32 *)(r0 + 4);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +/* to retain debug info for BTF generation */ +void kfunc_root(void) +{ + bpf_arena_alloc_pages(0, 0, 0, 0, 0); +} + #else SEC("socket") diff --git a/tools/testing/selftests/bpf/progs/verifier_live_stack.c b/tools/testing/selftests/bpf/progs/verifier_live_stack.c new file mode 100644 index 000000000000..c0e808509268 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_live_stack.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, long long); +} map SEC(".maps"); + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 2 +written -8") +__msg("(0) frame 0 insn 1 +live -24") +__msg("(0) frame 0 insn 1 +written -8") +__msg("(0) frame 0 insn 0 +live -8,-24") +__msg("(0) frame 0 insn 0 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void simple_read_simple_write(void) +{ + asm volatile ( + "r1 = *(u64 *)(r10 - 8);" + "r2 = *(u64 *)(r10 - 24);" + "*(u64 *)(r10 - 8) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 1 +live -8") +__not_msg("(0) frame 0 insn 1 +written") +__msg("(0) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 1 +live -16") +__msg("(0) frame 0 insn 1 +written -32") +__msg("(0) live stack update done in 2 iterations") +__naked void read_write_join(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 > 42 goto 1f;" + "r0 = *(u64 *)(r10 - 8);" + "*(u64 *)(r10 - 32) = r0;" + "*(u64 *)(r10 - 40) = r0;" + "exit;" +"1:" + "r0 = *(u64 *)(r10 - 16);" + "*(u64 *)(r10 - 32) = r0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("2: (25) if r0 > 0x2a goto pc+1") +__msg("7: (95) exit") +__msg("(0) frame 0 insn 2 +written -16") +__msg("(0) live stack update done in 2 iterations") +__msg("7: (95) exit") +__not_msg("(0) frame 0 insn 2") +__msg("(0) live stack update done in 1 iterations") +__naked void must_write_not_same_slot(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = -8;" + "if r0 > 42 goto 1f;" + "r1 = -16;" +"1:" + "r2 = r10;" + "r2 += r1;" + "*(u64 *)(r2 + 0) = r0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 0 +written 
-8,-16") +__msg("(0) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 0 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void must_write_not_same_type(void) +{ + asm volatile ( + "*(u64*)(r10 - 8) = 0;" + "r2 = r10;" + "r2 += -8;" + "r1 = %[map] ll;" + "call %[bpf_map_lookup_elem];" + "if r0 != 0 goto 1f;" + "r0 = r10;" + "r0 += -16;" +"1:" + "*(u64 *)(r0 + 0) = 42;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_map_lookup_elem), + __imm_addr(map) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(2,4) frame 0 insn 4 +written -8") +__msg("(2,4) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 2 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void caller_stack_write(void) +{ + asm volatile ( + "r1 = r10;" + "r1 += -8;" + "call write_first_param;" + "exit;" + ::: __clobber_all); +} + +static __used __naked void write_first_param(void) +{ + asm volatile ( + "*(u64 *)(r1 + 0) = 7;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +/* caller_stack_read() function */ +__msg("2: .12345.... (85) call pc+4") +__msg("5: .12345.... (85) call pc+1") +__msg("6: 0......... (95) exit") +/* read_first_param() function */ +__msg("7: .1........ (79) r0 = *(u64 *)(r1 +0)") +__msg("8: 0......... 
(95) exit") +/* update for callsite at (2) */ +__msg("(2,7) frame 0 insn 7 +live -8") +__msg("(2,7) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 2 +live -8") +__msg("(0) live stack update done in 2 iterations") +/* update for callsite at (5) */ +__msg("(5,7) frame 0 insn 7 +live -16") +__msg("(5,7) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 5 +live -16") +__msg("(0) live stack update done in 2 iterations") +__naked void caller_stack_read(void) +{ + asm volatile ( + "r1 = r10;" + "r1 += -8;" + "call read_first_param;" + "r1 = r10;" + "r1 += -16;" + "call read_first_param;" + "exit;" + ::: __clobber_all); +} + +static __used __naked void read_first_param(void) +{ + asm volatile ( + "r0 = *(u64 *)(r1 + 0);" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__flag(BPF_F_TEST_STATE_FREQ) +__log_level(2) +/* read_first_param2() function */ +__msg(" 9: .1........ (79) r0 = *(u64 *)(r1 +0)") +__msg("10: .......... (b7) r0 = 0") +__msg("11: 0......... (05) goto pc+0") +__msg("12: 0......... (95) exit") +/* + * The purpose of the test is to check that checkpoint in + * read_first_param2() stops path traversal. This will only happen if + * verifier understands that fp[0]-8 at insn (12) is not alive. + */ +__msg("12: safe") +__msg("processed 20 insns") +__naked void caller_stack_pruning(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 == 42 goto 1f;" + "r0 = %[map] ll;" +"1:" + "*(u64 *)(r10 - 8) = r0;" + "r1 = r10;" + "r1 += -8;" + /* + * fp[0]-8 is either pointer to map or a scalar, + * preventing state pruning at checkpoint created for call. + */ + "call read_first_param2;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm_addr(map) + : __clobber_all); +} + +static __used __naked void read_first_param2(void) +{ + asm volatile ( + "r0 = *(u64 *)(r1 + 0);" + "r0 = 0;" + /* + * Checkpoint at goto +0 should fire, + * as caller stack fp[0]-8 is not alive at this point. 
+ */ + "goto +0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg("R1 type=scalar expected=map_ptr") +__naked void caller_stack_pruning_callback(void) +{ + asm volatile ( + "r0 = %[map] ll;" + "*(u64 *)(r10 - 8) = r0;" + "r1 = 2;" + "r2 = loop_cb ll;" + "r3 = r10;" + "r3 += -8;" + "r4 = 0;" + /* + * fp[0]-8 is either pointer to map or a scalar, + * preventing state pruning at checkpoint created for call. + */ + "call %[bpf_loop];" + "r0 = 42;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_loop), + __imm_addr(map) + : __clobber_all); +} + +static __used __naked void loop_cb(void) +{ + asm volatile ( + /* + * Checkpoint at function entry should not fire, as caller + * stack fp[0]-8 is alive at this point. + */ + "r6 = r2;" + "r1 = *(u64 *)(r6 + 0);" + "*(u64*)(r10 - 8) = 7;" + "r2 = r10;" + "r2 += -8;" + "call %[bpf_map_lookup_elem];" + /* + * This should stop verifier on a second loop iteration, + * but only if verifier correctly maintains that fp[0]-8 + * is still alive. + */ + "*(u64 *)(r6 + 0) = 0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_map_lookup_elem), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * Because of a bug in verifier.c:compute_postorder() + * the program below overflowed traversal queue in that function. + */ +SEC("socket") +__naked void syzbot_postorder_bug1(void) +{ + asm volatile ( + "r0 = 0;" + "if r0 != 0 goto -1;" + "exit;" + ::: __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/verifier_loops1.c b/tools/testing/selftests/bpf/progs/verifier_loops1.c index e07b43b78fd2..fbdde80e7b90 100644 --- a/tools/testing/selftests/bpf/progs/verifier_loops1.c +++ b/tools/testing/selftests/bpf/progs/verifier_loops1.c @@ -283,4 +283,25 @@ exit_%=: \ : __clobber_all); } +/* + * This test case triggered a bug in verifier.c:maybe_exit_scc(). 
+ * Speculative execution path reaches stack access instruction, + * stops and triggers maybe_exit_scc() w/o accompanying maybe_enter_scc() call. + */ +SEC("socket") +__arch_x86_64 +__caps_unpriv(CAP_BPF) +__naked void maybe_exit_scc_bug1(void) +{ + asm volatile ( + "r0 = 100;" +"1:" + /* Speculative execution path reaches and stops here. */ + "*(u64 *)(r10 - 512) = r0;" + /* Condition is always false, but verifier speculatively executes the true branch. */ + "if r0 <= 0x0 goto 1b;" + "exit;" + ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index 11a079145966..e2767d27d8aa 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -70,10 +70,13 @@ __naked void bpf_map_ptr_write_rejected(void) : __clobber_all); } +/* The first element of struct bpf_map is a SHA256 hash of 32 bytes, accessing + * into this array is valid. The ops field is now at offset 32; the load below reads at offset 33. 
+ */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4") +__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) @@ -82,7 +85,7 @@ __naked void read_non_existent_field_rejected(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ - r6 = *(u32*)(r1 + 1); \ + r6 = *(u32*)(r1 + 33); \ r0 = 1; \ exit; \ " : diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c index 3966d827f288..6d1edaef9213 100644 --- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c +++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c @@ -9,6 +9,8 @@ SEC("raw_tp") __description("may_goto 0") __arch_x86_64 +__arch_s390x +__arch_arm64 __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -27,6 +29,8 @@ __naked void may_goto_simple(void) SEC("raw_tp") __description("batch 2 of may_goto 0") __arch_x86_64 +__arch_s390x +__arch_arm64 __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -47,6 +51,8 @@ __naked void may_goto_batch_0(void) SEC("raw_tp") __description("may_goto batch with offsets 2/1/0") __arch_x86_64 +__arch_s390x +__arch_arm64 __xlated("0: r0 = 1") __xlated("1: exit") __success @@ -69,8 +75,10 @@ __naked void may_goto_batch_1(void) } SEC("raw_tp") -__description("may_goto batch with offsets 2/0 - x86_64") +__description("may_goto batch with offsets 2/0") __arch_x86_64 +__arch_s390x +__arch_arm64 __xlated("0: *(u64 *)(r10 -16) = 65535") __xlated("1: *(u64 *)(r10 -8) = 0") __xlated("2: r11 = *(u64 *)(r10 -16)") @@ -84,33 +92,7 @@ __xlated("9: r0 = 1") __xlated("10: r0 = 2") __xlated("11: exit") __success -__naked void may_goto_batch_2_x86_64(void) -{ - asm volatile ( - ".8byte %[may_goto1];" - ".8byte %[may_goto3];" - "r0 = 
1;" - "r0 = 2;" - "exit;" - : - : __imm_insn(may_goto1, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 2 /* offset */, 0)), - __imm_insn(may_goto3, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0 /* offset */, 0)) - : __clobber_all); -} - -SEC("raw_tp") -__description("may_goto batch with offsets 2/0 - arm64") -__arch_arm64 -__xlated("0: *(u64 *)(r10 -8) = 8388608") -__xlated("1: r11 = *(u64 *)(r10 -8)") -__xlated("2: if r11 == 0x0 goto pc+3") -__xlated("3: r11 -= 1") -__xlated("4: *(u64 *)(r10 -8) = r11") -__xlated("5: r0 = 1") -__xlated("6: r0 = 2") -__xlated("7: exit") -__success -__naked void may_goto_batch_2_arm64(void) +__naked void may_goto_batch_2(void) { asm volatile ( ".8byte %[may_goto1];" diff --git a/tools/testing/selftests/bpf/progs/verifier_mul.c b/tools/testing/selftests/bpf/progs/verifier_mul.c new file mode 100644 index 000000000000..7145fe3351d5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_mul.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Nandakumar Edamana */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +/* Intended to test the abstract multiplication technique(s) used by + * the verifier. Using assembly to avoid compiler optimizations. + */ +SEC("fentry/bpf_fentry_test1") +void BPF_PROG(mul_precise, int x) +{ + /* First, force the verifier to be uncertain about the value: + * unsigned int a = (bpf_get_prandom_u32() & 0x2) | 0x1; + * + * Assuming the verifier is using tnum, a must be tnum{.v=0x1, .m=0x2}. + * Then a * 0x3 would be m0m1 (m for uncertain). 
Added imprecision + * would cause the following to fail, because the required return value + * is 0: + * return (a * 0x3) & 0x4; + */ + asm volatile ("\ + call %[bpf_get_prandom_u32];\ + r0 &= 0x2;\ + r0 |= 0x1;\ + r0 *= 0x3;\ + r0 &= 0x4;\ + if r0 != 0 goto l0_%=;\ + r0 = 0;\ + goto l1_%=;\ +l0_%=:\ + r0 = 1;\ +l1_%=:\ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 73fee2aec698..1fe090cd6744 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -144,21 +144,21 @@ SEC("?raw_tp") __success __log_level(2) /* * Without the bug fix there will be no history between "last_idx 3 first_idx 3" - * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor + * and "parent state regs=" lines. "R0=6" parts are here to help anchor * expected log messages to the one specific mark_chain_precision operation. * * This is quite fragile: if verifier checkpointing heuristic changes, this * might need adjusting. 
*/ -__msg("2: (07) r0 += 1 ; R0_w=6") +__msg("2: (07) r0 += 1 ; R0=6") __msg("3: (35) if r0 >= 0xa goto pc+1") __msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1") -__msg("mark_precise: frame0: parent state regs= stack=: R0_rw=P4") -__msg("3: R0_w=6") +__msg("mark_precise: frame0: parent state regs= stack=: R0=P4") +__msg("3: R0=6") __naked int state_loop_first_last_equal(void) { asm volatile ( @@ -233,8 +233,8 @@ __naked void bpf_cond_op_not_r10(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (84) w0 = -w0 ; R0_w=0xffffffff") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (84) w0 = -w0 ; R0=0xffffffff") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (84) w0 = -w0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") @@ -268,8 +268,8 @@ __naked int bpf_neg_3(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (87) r0 = -r0 ; R0_w=-1") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (87) r0 = -r0 ; R0=-1") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (87) r0 = -r0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index 7c5e5e6d10eb..c0ce690ddb68 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -349,11 +349,11 @@ __naked void precision_two_ids(void) SEC("socket") __success 
__log_level(2) __flag(BPF_F_TEST_STATE_FREQ) -/* check thar r0 and r6 have different IDs after 'if', +/* check that r0 and r6 have different IDs after 'if', * collect_linked_regs() can't tie more than 6 registers for a single insn. */ __msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") -__msg("9: (bf) r6 = r6 ; R6_w=scalar(id=2") +__msg("9: (bf) r6 = r6 ; R6=scalar(id=2") /* check that r{0-5} are marked precise after 'if' */ __msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") __msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") @@ -779,12 +779,12 @@ __success __retval(0) /* Check that verifier believes r1/r0 are zero at exit */ __log_level(2) -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") __msg("from 3 to 4") -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") /* Verify that statements to randomize upper half of r1 had not been * generated. 
diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index 0d5e56dffabb..2b4610b53382 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1,14 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* Converted from tools/testing/selftests/bpf/verifier/sock.c */ -#include <linux/bpf.h> +#include "vmlinux.h" #include <bpf/bpf_helpers.h> #include "bpf_misc.h" -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) - struct { __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); __uint(max_entries, 1); @@ -1073,6 +1069,48 @@ int invalidate_pkt_pointers_from_global_func(struct __sk_buff *sk) } __noinline +long xdp_pull_data2(struct xdp_md *x, __u32 len) +{ + return bpf_xdp_pull_data(x, len); +} + +__noinline +long xdp_pull_data1(struct xdp_md *x, __u32 len) +{ + return xdp_pull_data2(x, len); +} + +/* global function calls bpf_xdp_pull_data(), which invalidates packet + * pointers established before global function call. 
+ */ +SEC("xdp") +__failure __msg("invalid mem access") +int invalidate_xdp_pkt_pointers_from_global_func(struct xdp_md *x) +{ + int *p = (void *)(long)x->data; + + if ((void *)(p + 1) > (void *)(long)x->data_end) + return XDP_DROP; + xdp_pull_data1(x, 0); + *p = 42; /* this is unsafe */ + return XDP_PASS; +} + +/* XDP packet changing kfunc calls invalidate packet pointers */ +SEC("xdp") +__failure __msg("invalid mem access") +int invalidate_xdp_pkt_pointers(struct xdp_md *x) +{ + int *p = (void *)(long)x->data; + + if ((void *)(p + 1) > (void *)(long)x->data_end) + return XDP_DROP; + bpf_xdp_pull_data(x, 0); + *p = 42; /* this is unsafe */ + return XDP_PASS; +} + +__noinline int tail_call(struct __sk_buff *sk) { bpf_tail_call_static(sk, &jmp_table, 0); diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 1e5a511e8494..7a13dbd794b2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -506,17 +506,17 @@ SEC("raw_tp") __log_level(2) __success /* fp-8 is spilled IMPRECISE value zero (represented by a zero value fake reg) */ -__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8_w=0") +__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8=0") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=0 R10=fp0 fp-16_w=0") +__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0=0 R10=fp0 fp-16=0") /* validate that assigning R2 from STACK_SPILL with zero value doesn't mark register * precise immediately; if necessary, it will be marked precise later */ -__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8_w=0") +__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0") /* similarly, when R2 is assigned from spilled register, it is initially * imprecise, but will be marked precise later once it is used in precise context */ -__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2_w=0 R10=fp0 
fp-16_w=0") +__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2=0 R10=fp0 fp-16=0") __msg("11: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 11 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 10: (71) r2 = *(u8 *)(r10 -9)") @@ -598,7 +598,7 @@ __log_level(2) __success /* fp-4 is STACK_ZERO */ __msg("2: (62) *(u32 *)(r10 -4) = 0 ; R10=fp0 fp-8=0000????") -__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8=0000????") +__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0000????") __msg("5: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 4: (71) r2 = *(u8 *)(r10 -1)") @@ -640,25 +640,25 @@ SEC("raw_tp") __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) __success /* make sure fp-8 is IMPRECISE fake register spill */ -__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8_w=1") +__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8=1") /* and fp-16 is spilled IMPRECISE const reg */ -__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16_w=1") +__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=1") +__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2=1 R10=fp0 fp-8=1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") /* note, fp-8 is precise, fp-16 is not yet precise, we'll get there */ -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_w=1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 
subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7a) *(u64 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=1") +__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2=1 R10=fp0 fp-16=1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (79) r2 = *(u64 *)(r10 -16)") @@ -668,12 +668,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") /* now both fp-8 and fp-16 are precise, very good */ -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision(void) { asm volatile ( @@ -719,22 +719,22 @@ __success /* make sure 
fp-8 is 32-bit FAKE subregister spill */ __msg("3: (62) *(u32 *)(r10 -8) = 1 ; R10=fp0 fp-8=????1") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16=????1") +__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=????1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=????1") +__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2=1 R10=fp0 fp-8=????1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16=????1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (62) *(u32 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=????1") +__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2=1 R10=fp0 fp-16=????1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (61) r2 = *(u32 *)(r10 -16)") @@ -743,12 +743,12 @@ __msg("mark_precise: 
frame0: regs= stack=-16 before 10: (73) *(u8 *)(r1 +0) = r2 __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16_r=????P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision_subreg(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 9d415f7ce599..ac3e418c2a96 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -105,7 +105,7 @@ __msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") __msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") __msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int fp_precise_subprog_result(void) { asm volatile ( @@ -141,7 +141,7 @@ __msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") * anyways, at which point we'll break precision chain */ 
__msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int sneaky_fp_precise_subprog_result(void) { asm volatile ( @@ -681,7 +681,7 @@ __msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8") __msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4") __msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)") -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=2 R6_w=1 R8_rw=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=2 R6=1 R8=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8=P1") __msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") __msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2") diff --git a/tools/testing/selftests/bpf/progs/verifier_var_off.c b/tools/testing/selftests/bpf/progs/verifier_var_off.c index 1d36d01b746e..f345466bca68 100644 --- a/tools/testing/selftests/bpf/progs/verifier_var_off.c +++ b/tools/testing/selftests/bpf/progs/verifier_var_off.c @@ -114,8 +114,8 @@ __naked void stack_write_priv_vs_unpriv(void) } /* Similar to the previous test, but this time also perform a read from the - * address written to with a variable offset. The read is allowed, showing that, - * after a variable-offset write, a priviledged program can read the slots that + * address written to with a variable offset. The read is allowed, showing that, + * after a variable-offset write, a privileged program can read the slots that * were in the range of that write (even if the verifier doesn't actually know if * the slot being read was really written to or not. 
* @@ -157,7 +157,7 @@ __naked void stack_write_followed_by_read(void) SEC("socket") __description("variable-offset stack write clobbers spilled regs") __failure -/* In the priviledged case, dereferencing a spilled-and-then-filled +/* In the privileged case, dereferencing a spilled-and-then-filled * register is rejected because the previous variable offset stack * write might have overwritten the spilled pointer (i.e. we lose track * of the spilled register when we analyze the write). diff --git a/tools/testing/selftests/bpf/test_kmods/Makefile b/tools/testing/selftests/bpf/test_kmods/Makefile index d4e50c4509c9..63c4d3f6a12f 100644 --- a/tools/testing/selftests/bpf/test_kmods/Makefile +++ b/tools/testing/selftests/bpf/test_kmods/Makefile @@ -8,7 +8,7 @@ Q = @ endif MODULES = bpf_testmod.ko bpf_test_no_cfi.ko bpf_test_modorder_x.ko \ - bpf_test_modorder_y.ko + bpf_test_modorder_y.ko bpf_test_rqspinlock.ko $(foreach m,$(MODULES),$(eval obj-m += $(m:.ko=.o))) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c new file mode 100644 index 000000000000..769206fc70e4 --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/prandom.h> +#include <asm/rqspinlock.h> +#include <linux/perf_event.h> +#include <linux/kthread.h> +#include <linux/atomic.h> +#include <linux/slab.h> + +static struct perf_event_attr hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, + .sample_period = 100000, +}; + +static rqspinlock_t lock_a; +static rqspinlock_t lock_b; + +static struct perf_event **rqsl_evts; +static int rqsl_nevts; + +static bool test_ab = false; +module_param(test_ab, bool, 0644); +MODULE_PARM_DESC(test_ab, "Test ABBA situations instead of AA situations"); + +static struct task_struct **rqsl_threads; +static int rqsl_nthreads; +static atomic_t rqsl_ready_cpus = ATOMIC_INIT(0); + +static int pause = 0; + +static bool nmi_locks_a(int cpu) +{ + return (cpu & 1) && test_ab; +} + +static int rqspinlock_worker_fn(void *arg) +{ + int cpu = smp_processor_id(); + unsigned long flags; + int ret; + + if (cpu) { + atomic_inc(&rqsl_ready_cpus); + + while (!kthread_should_stop()) { + if (READ_ONCE(pause)) { + msleep(1000); + continue; + } + if (nmi_locks_a(cpu)) + ret = raw_res_spin_lock_irqsave(&lock_b, flags); + else + ret = raw_res_spin_lock_irqsave(&lock_a, flags); + mdelay(20); + if (nmi_locks_a(cpu) && !ret) + raw_res_spin_unlock_irqrestore(&lock_b, flags); + else if (!ret) + raw_res_spin_unlock_irqrestore(&lock_a, flags); + cpu_relax(); + } + return 0; + } + + while (!kthread_should_stop()) { + int expected = rqsl_nthreads > 0 ? 
rqsl_nthreads - 1 : 0; + int ready = atomic_read(&rqsl_ready_cpus); + + if (ready == expected && !READ_ONCE(pause)) { + for (int i = 0; i < rqsl_nevts; i++) + perf_event_enable(rqsl_evts[i]); + pr_err("Waiting 5 secs to pause the test\n"); + msleep(1000 * 5); + WRITE_ONCE(pause, 1); + pr_err("Paused the test\n"); + } else { + msleep(1000); + cpu_relax(); + } + } + return 0; +} + +static void nmi_cb(struct perf_event *event, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + unsigned long flags; + int ret; + + if (!cpu || READ_ONCE(pause)) + return; + + if (nmi_locks_a(cpu)) + ret = raw_res_spin_lock_irqsave(&lock_a, flags); + else + ret = raw_res_spin_lock_irqsave(test_ab ? &lock_b : &lock_a, flags); + + mdelay(10); + + if (nmi_locks_a(cpu) && !ret) + raw_res_spin_unlock_irqrestore(&lock_a, flags); + else if (!ret) + raw_res_spin_unlock_irqrestore(test_ab ? &lock_b : &lock_a, flags); +} + +static void free_rqsl_threads(void) +{ + int i; + + if (rqsl_threads) { + for_each_online_cpu(i) { + if (rqsl_threads[i]) + kthread_stop(rqsl_threads[i]); + } + kfree(rqsl_threads); + } +} + +static void free_rqsl_evts(void) +{ + int i; + + if (rqsl_evts) { + for (i = 0; i < rqsl_nevts; i++) { + if (rqsl_evts[i]) + perf_event_release_kernel(rqsl_evts[i]); + } + kfree(rqsl_evts); + } +} + +static int bpf_test_rqspinlock_init(void) +{ + int i, ret; + int ncpus = num_online_cpus(); + + pr_err("Mode = %s\n", test_ab ? 
"ABBA" : "AA"); + + if (ncpus < 3) + return -ENOTSUPP; + + raw_res_spin_lock_init(&lock_a); + raw_res_spin_lock_init(&lock_b); + + rqsl_evts = kcalloc(ncpus - 1, sizeof(*rqsl_evts), GFP_KERNEL); + if (!rqsl_evts) + return -ENOMEM; + rqsl_nevts = ncpus - 1; + + for (i = 1; i < ncpus; i++) { + struct perf_event *e; + + e = perf_event_create_kernel_counter(&hw_attr, i, NULL, nmi_cb, NULL); + if (IS_ERR(e)) { + ret = PTR_ERR(e); + goto err_perf_events; + } + rqsl_evts[i - 1] = e; + } + + rqsl_threads = kcalloc(ncpus, sizeof(*rqsl_threads), GFP_KERNEL); + if (!rqsl_threads) { + ret = -ENOMEM; + goto err_perf_events; + } + rqsl_nthreads = ncpus; + + for_each_online_cpu(i) { + struct task_struct *t; + + t = kthread_create(rqspinlock_worker_fn, NULL, "rqsl_w/%d", i); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + goto err_threads_create; + } + kthread_bind(t, i); + rqsl_threads[i] = t; + wake_up_process(t); + } + return 0; + +err_threads_create: + free_rqsl_threads(); +err_perf_events: + free_rqsl_evts(); + return ret; +} + +module_init(bpf_test_rqspinlock_init); + +static void bpf_test_rqspinlock_exit(void) +{ + free_rqsl_threads(); + free_rqsl_evts(); +} + +module_exit(bpf_test_rqspinlock_exit); + +MODULE_AUTHOR("Kumar Kartikeya Dwivedi"); +MODULE_DESCRIPTION("BPF rqspinlock stress test module"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e9e918cdf31f..8074bc5f6f20 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -62,6 +62,18 @@ struct bpf_testmod_struct_arg_5 { long d; }; +union bpf_testmod_union_arg_1 { + char a; + short b; + struct bpf_testmod_struct_arg_1 arg; +}; + +union bpf_testmod_union_arg_2 { + int a; + long b; + struct bpf_testmod_struct_arg_2 arg; +}; + __bpf_hook_start(); noinline int @@ -129,6 +141,20 @@ bpf_testmod_test_struct_arg_9(u64 a, void *b, short c, int d, void *e, char 
f, } noinline int +bpf_testmod_test_union_arg_1(union bpf_testmod_union_arg_1 a, int b, int c) +{ + bpf_testmod_test_struct_arg_result = a.arg.a + b + c; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_union_arg_2(int a, union bpf_testmod_union_arg_2 b) +{ + bpf_testmod_test_struct_arg_result = a + b.arg.a + b.arg.b; + return bpf_testmod_test_struct_arg_result; +} + +noinline int bpf_testmod_test_arg_ptr_to_struct(struct bpf_testmod_struct_arg_1 *a) { bpf_testmod_test_struct_arg_result = a->a; return bpf_testmod_test_struct_arg_result; @@ -218,6 +244,16 @@ __bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) { } +__bpf_kfunc struct task_struct *bpf_kfunc_ret_rcu_test(void) +{ + return NULL; +} + +__bpf_kfunc int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) +{ + return NULL; +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -398,6 +434,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, struct bpf_testmod_struct_arg_3 *struct_arg3; struct bpf_testmod_struct_arg_4 struct_arg4 = {21, 22}; struct bpf_testmod_struct_arg_5 struct_arg5 = {23, 24, 25, 26}; + union bpf_testmod_union_arg_1 union_arg1 = { .arg = {1} }; + union bpf_testmod_union_arg_2 union_arg2 = { .arg = {2, 3} }; int i = 1; while (bpf_testmod_return_ptr(i)) @@ -415,6 +453,9 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, (void)bpf_testmod_test_struct_arg_9(16, (void *)17, 18, 19, (void *)20, 21, 22, struct_arg5, 27); + (void)bpf_testmod_test_union_arg_1(union_arg1, 4, 5); + (void)bpf_testmod_test_union_arg_2(6, union_arg2); + (void)bpf_testmod_test_arg_ptr_to_struct(&struct_arg1_2); (void)trace_bpf_testmod_test_raw_tp_null_tp(NULL); @@ -501,14 +542,20 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { #ifdef __x86_64__ static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + 
+static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -520,6 +567,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; @@ -623,6 +671,8 @@ BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) @@ -1057,6 +1107,8 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) return args->a; } +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1097,6 +1149,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABL BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1528,6 +1581,114 @@ static struct bpf_struct_ops testmod_st_ops = { .owner = THIS_MODULE, }; +struct 
hlist_head multi_st_ops_list; +static DEFINE_SPINLOCK(multi_st_ops_lock); + +static int multi_st_ops_init(struct btf *btf) +{ + spin_lock_init(&multi_st_ops_lock); + INIT_HLIST_HEAD(&multi_st_ops_list); + + return 0; +} + +static int multi_st_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static struct bpf_testmod_multi_st_ops *multi_st_ops_find_nolock(u32 id) +{ + struct bpf_testmod_multi_st_ops *st_ops; + + hlist_for_each_entry(st_ops, &multi_st_ops_list, node) { + if (st_ops->id == id) + return st_ops; + } + + return NULL; +} + +int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) +{ + struct bpf_testmod_multi_st_ops *st_ops; + unsigned long flags; + int ret = -1; + + spin_lock_irqsave(&multi_st_ops_lock, flags); + st_ops = multi_st_ops_find_nolock(id); + if (st_ops) + ret = st_ops->test_1(args); + spin_unlock_irqrestore(&multi_st_ops_lock, flags); + + return ret; +} + +static int multi_st_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_testmod_multi_st_ops *st_ops = + (struct bpf_testmod_multi_st_ops *)kdata; + unsigned long flags; + int err = 0; + u32 id; + + if (!st_ops->test_1) + return -EINVAL; + + id = bpf_struct_ops_id(kdata); + + spin_lock_irqsave(&multi_st_ops_lock, flags); + if (multi_st_ops_find_nolock(id)) { + pr_err("multi_st_ops(id:%d) has already been registered\n", id); + err = -EEXIST; + goto unlock; + } + + st_ops->id = id; + hlist_add_head(&st_ops->node, &multi_st_ops_list); +unlock: + spin_unlock_irqrestore(&multi_st_ops_lock, flags); + + return err; +} + +static void multi_st_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_testmod_multi_st_ops *st_ops; + unsigned long flags; + u32 id; + + id = bpf_struct_ops_id(kdata); + + spin_lock_irqsave(&multi_st_ops_lock, flags); + st_ops = multi_st_ops_find_nolock(id); + if (st_ops) + hlist_del(&st_ops->node); + spin_unlock_irqrestore(&multi_st_ops_lock, flags); +} + +static int 
bpf_testmod_multi_st_ops__test_1(struct st_ops_args *args) +{ + return 0; +} + +static struct bpf_testmod_multi_st_ops multi_st_ops_cfi_stubs = { + .test_1 = bpf_testmod_multi_st_ops__test_1, +}; + +struct bpf_struct_ops testmod_multi_st_ops = { + .verifier_ops = &bpf_testmod_verifier_ops, + .init = multi_st_ops_init, + .init_member = multi_st_ops_init_member, + .reg = multi_st_ops_reg, + .unreg = multi_st_ops_unreg, + .cfi_stubs = &multi_st_ops_cfi_stubs, + .name = "bpf_testmod_multi_st_ops", + .owner = THIS_MODULE, +}; + extern int bpf_fentry_test1(int a); static int bpf_testmod_init(void) @@ -1550,6 +1711,7 @@ static int bpf_testmod_init(void) ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops3, bpf_testmod_ops3); ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); + ret = ret ?: register_bpf_struct_ops(&testmod_multi_st_ops, bpf_testmod_multi_st_ops); ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, ARRAY_SIZE(bpf_testmod_dtors), THIS_MODULE); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h index c9fab51f16e2..f6e492f9d042 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h @@ -116,4 +116,10 @@ struct bpf_testmod_st_ops { struct module *owner; }; +struct bpf_testmod_multi_st_ops { + int (*test_1)(struct st_ops_args *args); + struct hlist_node node; + int id; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index b58817938deb..4df6fa6a92cb 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -158,5 +158,9 @@ void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) __ksym; void 
bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; void bpf_kfunc_trusted_num_test(int *ptr) __ksym; void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; +struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; +int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; + +int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c index 4694422aa76c..88e4aeab21b7 100644 --- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c +++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c @@ -74,7 +74,7 @@ int main(int argc, char **argv) /* Let's try detach it before it was ever attached */ ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2); - if (ret != -1 || errno != ENOENT) { + if (ret != -ENOENT) { printf("bpf_prog_detach2 not attached should fail: %m\n"); return 1; } diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 78423cf89e01..74ecc281bb8c 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -2,7 +2,6 @@ /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/ #include <linux/capability.h> #include <stdlib.h> -#include <regex.h> #include <test_progs.h> #include <bpf/btf.h> @@ -20,10 +19,12 @@ #define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" +#define TEST_TAG_EXPECT_NOT_MSG_PFX "comment:test_expect_not_msg=" #define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" +#define TEST_TAG_EXPECT_NOT_MSG_PFX_UNPRIV "comment:test_expect_not_msg_unpriv=" #define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" @@ -38,6 +39,10 @@ #define TEST_TAG_JITED_PFX_UNPRIV "comment:test_jited_unpriv=" #define TEST_TAG_CAPS_UNPRIV "comment:test_caps_unpriv=" #define TEST_TAG_LOAD_MODE_PFX "comment:load_mode=" +#define TEST_TAG_EXPECT_STDERR_PFX "comment:test_expect_stderr=" +#define TEST_TAG_EXPECT_STDERR_PFX_UNPRIV "comment:test_expect_stderr_unpriv=" +#define TEST_TAG_EXPECT_STDOUT_PFX "comment:test_expect_stdout=" +#define TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV "comment:test_expect_stdout_unpriv=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xbadcafe @@ -61,24 +66,14 @@ enum load_mode { NO_JITED = 1 << 1, }; -struct expect_msg { - const char *substr; /* substring match */ - regex_t regex; - bool is_regex; - bool on_next_line; -}; - -struct expected_msgs { - struct expect_msg *patterns; - size_t cnt; -}; - struct test_subspec { char *name; bool expect_failure; struct expected_msgs expect_msgs; struct expected_msgs expect_xlated; struct expected_msgs jited; + struct expected_msgs stderr; + struct expected_msgs stdout; int retval; 
bool execute; __u64 caps; @@ -139,6 +134,10 @@ static void free_test_spec(struct test_spec *spec) free_msgs(&spec->unpriv.expect_xlated); free_msgs(&spec->priv.jited); free_msgs(&spec->unpriv.jited); + free_msgs(&spec->unpriv.stderr); + free_msgs(&spec->priv.stderr); + free_msgs(&spec->unpriv.stdout); + free_msgs(&spec->priv.stdout); free(spec->priv.name); free(spec->unpriv.name); @@ -206,7 +205,8 @@ static int compile_regex(const char *pattern, regex_t *regex) return 0; } -static int __push_msg(const char *pattern, bool on_next_line, struct expected_msgs *msgs) +static int __push_msg(const char *pattern, bool on_next_line, bool negative, + struct expected_msgs *msgs) { struct expect_msg *msg; void *tmp; @@ -222,6 +222,7 @@ static int __push_msg(const char *pattern, bool on_next_line, struct expected_ms msg = &msgs->patterns[msgs->cnt]; msg->on_next_line = on_next_line; msg->substr = pattern; + msg->negative = negative; msg->is_regex = false; if (strstr(pattern, "{{")) { err = compile_regex(pattern, &msg->regex); @@ -240,16 +241,16 @@ static int clone_msgs(struct expected_msgs *from, struct expected_msgs *to) for (i = 0; i < from->cnt; i++) { msg = &from->patterns[i]; - err = __push_msg(msg->substr, msg->on_next_line, to); + err = __push_msg(msg->substr, msg->on_next_line, msg->negative, to); if (err) return err; } return 0; } -static int push_msg(const char *substr, struct expected_msgs *msgs) +static int push_msg(const char *substr, bool negative, struct expected_msgs *msgs) { - return __push_msg(substr, false, msgs); + return __push_msg(substr, false, negative, msgs); } static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct expected_msgs *msgs) @@ -260,7 +261,7 @@ static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct exp *on_next_line = false; return 0; } - err = __push_msg(regex_str, *on_next_line, msgs); + err = __push_msg(regex_str, *on_next_line, false, msgs); if (err) return err; *on_next_line = true; @@ -374,6 
+375,7 @@ enum arch { ARCH_X86_64 = 0x2, ARCH_ARM64 = 0x4, ARCH_RISCV64 = 0x8, + ARCH_S390X = 0x10, }; static int get_current_arch(void) @@ -384,6 +386,8 @@ static int get_current_arch(void) return ARCH_ARM64; #elif defined(__riscv) && __riscv_xlen == 64 return ARCH_RISCV64; +#elif defined(__s390x__) + return ARCH_S390X; #endif return ARCH_UNKNOWN; } @@ -404,6 +408,10 @@ static int parse_test_spec(struct test_loader *tester, bool xlated_on_next_line = true; bool unpriv_jit_on_next_line; bool jit_on_next_line; + bool stderr_on_next_line = true; + bool unpriv_stderr_on_next_line = true; + bool stdout_on_next_line = true; + bool unpriv_stdout_on_next_line = true; bool collect_jit = false; int func_id, i, err = 0; u32 arch_mask = 0; @@ -465,12 +473,22 @@ static int parse_test_spec(struct test_loader *tester, spec->auxiliary = true; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { - err = push_msg(msg, &spec->priv.expect_msgs); + err = push_msg(msg, false, &spec->priv.expect_msgs); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_NOT_MSG_PFX))) { + err = push_msg(msg, true, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { - err = push_msg(msg, &spec->unpriv.expect_msgs); + err = push_msg(msg, false, &spec->unpriv.expect_msgs); + if (err) + goto cleanup; + spec->mode_mask |= UNPRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_NOT_MSG_PFX_UNPRIV))) { + err = push_msg(msg, true, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -565,8 +583,10 @@ static int parse_test_spec(struct test_loader *tester, arch = ARCH_ARM64; } else if (strcmp(val, "RISCV64") == 0) { arch = ARCH_RISCV64; + } else if (strcmp(val, "s390x") == 0) { + arch = ARCH_S390X; } else { - PRINT_FAIL("bad arch spec: '%s'", val); + PRINT_FAIL("bad arch spec: '%s'\n", 
val); err = -EINVAL; goto cleanup; } @@ -593,6 +613,26 @@ static int parse_test_spec(struct test_loader *tester, err = -EINVAL; goto cleanup; } + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDERR_PFX))) { + err = push_disasm_msg(msg, &stderr_on_next_line, + &spec->priv.stderr); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDERR_PFX_UNPRIV))) { + err = push_disasm_msg(msg, &unpriv_stderr_on_next_line, + &spec->unpriv.stderr); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDOUT_PFX))) { + err = push_disasm_msg(msg, &stdout_on_next_line, + &spec->priv.stdout); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV))) { + err = push_disasm_msg(msg, &unpriv_stdout_on_next_line, + &spec->unpriv.stdout); + if (err) + goto cleanup; } } @@ -646,6 +686,10 @@ static int parse_test_spec(struct test_loader *tester, clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); if (spec->unpriv.jited.cnt == 0) clone_msgs(&spec->priv.jited, &spec->unpriv.jited); + if (spec->unpriv.stderr.cnt == 0) + clone_msgs(&spec->priv.stderr, &spec->unpriv.stderr); + if (spec->unpriv.stdout.cnt == 0) + clone_msgs(&spec->priv.stdout, &spec->unpriv.stdout); } spec->valid = true; @@ -707,44 +751,155 @@ static void emit_jited(const char *jited, bool force) fprintf(stdout, "JITED:\n=============\n%s=============\n", jited); } -static void validate_msgs(char *log_buf, struct expected_msgs *msgs, - void (*emit_fn)(const char *buf, bool force)) +static void emit_stderr(const char *stderr, bool force) { - const char *log = log_buf, *prev_match; + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "STDERR:\n=============\n%s=============\n", stderr); +} + +static void emit_stdout(const char *bpf_stdout, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "STDOUT:\n=============\n%s=============\n", 
bpf_stdout); +} + +static const char *match_msg(struct expect_msg *msg, const char **log) +{ + const char *match = NULL; regmatch_t reg_match[1]; - int prev_match_line; - int match_line; - int i, j, err; + int err; + + if (!msg->is_regex) { + match = strstr(*log, msg->substr); + if (match) + *log = match + strlen(msg->substr); + } else { + err = regexec(&msg->regex, *log, 1, reg_match, 0); + if (err == 0) { + match = *log + reg_match[0].rm_so; + *log += reg_match[0].rm_eo; + } + } + return match; +} + +static int count_lines(const char *start, const char *end) +{ + const char *tmp; + int n = 0; + + for (tmp = start; tmp < end; ++tmp) + if (*tmp == '\n') + n++; + return n; +} + +struct match { + const char *start; + const char *end; + int line; +}; + +/* + * Positive messages are matched sequentially, each next message + * is looked for starting from the end of a previous matched one. + */ +static void match_positive_msgs(const char *log, struct expected_msgs *msgs, struct match *matches) +{ + const char *prev_match; + int i, line; - prev_match_line = -1; - match_line = 0; prev_match = log; + line = 0; for (i = 0; i < msgs->cnt; i++) { struct expect_msg *msg = &msgs->patterns[i]; - const char *match = NULL, *pat_status; - bool wrong_line = false; - - if (!msg->is_regex) { - match = strstr(log, msg->substr); - if (match) - log = match + strlen(msg->substr); - } else { - err = regexec(&msg->regex, log, 1, reg_match, 0); - if (err == 0) { - match = log + reg_match[0].rm_so; - log += reg_match[0].rm_eo; + const char *match = NULL; + + if (msg->negative) + continue; + + match = match_msg(msg, &log); + if (match) { + line += count_lines(prev_match, match); + matches[i].start = match; + matches[i].end = log; + matches[i].line = line; + prev_match = match; + } + } +} + +/* + * Each negative messages N located between positive messages P1 and P2 + * is matched in the span P1.end .. P2.start. Consequently, negative messages + * are unordered within the span. 
+ */ +static void match_negative_msgs(const char *log, struct expected_msgs *msgs, struct match *matches) +{ + const char *start = log, *end, *next, *match; + const char *log_end = log + strlen(log); + int i, j, next_positive; + + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; + + /* positive message bumps span start */ + if (!msg->negative) { + start = matches[i].end ?: start; + continue; + } + + /* count stride of negative patterns and adjust span end */ + end = log_end; + for (next_positive = i + 1; next_positive < msgs->cnt; next_positive++) { + if (!msgs->patterns[next_positive].negative) { + end = matches[next_positive].start; + break; } } - if (match) { - for (; prev_match < match; ++prev_match) - if (*prev_match == '\n') - ++match_line; - wrong_line = msg->on_next_line && prev_match_line >= 0 && - prev_match_line + 1 != match_line; + /* try matching negative messages within identified span */ + for (j = i; j < next_positive; j++) { + next = start; + match = match_msg(msg, &next); + if (match && next <= end) { + matches[j].start = match; + matches[j].end = next; + } } - if (!match || wrong_line) { + /* -1 to account for i++ */ + i = next_positive - 1; + } +} + +void validate_msgs(const char *log_buf, struct expected_msgs *msgs, + void (*emit_fn)(const char *buf, bool force)) +{ + struct match matches[msgs->cnt]; + struct match *prev_match = NULL; + int i, j; + + memset(matches, 0, sizeof(*matches) * msgs->cnt); + match_positive_msgs(log_buf, msgs, matches); + match_negative_msgs(log_buf, msgs, matches); + + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; + struct match *match = &matches[i]; + const char *pat_status; + bool unexpected; + bool wrong_line; + bool no_match; + + no_match = !msg->negative && !match->start; + wrong_line = !msg->negative && + msg->on_next_line && + prev_match && prev_match->line + 1 != match->line; + unexpected = msg->negative && match->start; + if (no_match || 
wrong_line || unexpected) { PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); @@ -754,8 +909,10 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, pat_status = "MATCHED "; else if (wrong_line) pat_status = "WRONG LINE"; - else + else if (no_match) pat_status = "EXPECTED "; + else + pat_status = "UNEXPECTED"; msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", pat_status, @@ -765,12 +922,13 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, if (wrong_line) { fprintf(stderr, "expecting match at line %d, actual match is at line %d\n", - prev_match_line + 1, match_line); + prev_match->line + 1, match->line); } break; } - prev_match_line = match_line; + if (!msg->negative) + prev_match = match; } } @@ -929,6 +1087,19 @@ out: return err; } +/* Read the bpf stream corresponding to the stream_id */ +static int get_stream(int stream_id, int prog_fd, char *text, size_t text_sz) +{ + LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); + int ret; + + ret = bpf_prog_stream_read(prog_fd, stream_id, text, text_sz, &ropts); + ASSERT_GT(ret, 0, "stream read"); + text[ret] = '\0'; + + return ret; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -1083,7 +1254,7 @@ void run_subtest(struct test_loader *tester, link = bpf_map__attach_struct_ops(map); if (!link) { PRINT_FAIL("bpf_map__attach_struct_ops failed for map %s: err=%d\n", - bpf_map__name(map), err); + bpf_map__name(map), -errno); goto tobj_cleanup; } links[links_cnt++] = link; @@ -1103,6 +1274,31 @@ void run_subtest(struct test_loader *tester, PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; } + + if (subspec->stderr.cnt) { + err = get_stream(2, bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err <= 0) { + PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n", + err, errno); + 
goto tobj_cleanup; + } + emit_stderr(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->stderr, emit_stderr); + } + + if (subspec->stdout.cnt) { + err = get_stream(1, bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err <= 0) { + PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n", + err, errno); + goto tobj_cleanup; + } + emit_stdout(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->stdout, emit_stdout); + } + /* redo bpf_map__attach_struct_ops for each test */ while (links_cnt > 0) bpf_link__destroy(links[--links_cnt]); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 309d9d4a8ace..02a85dda30e6 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -14,12 +14,14 @@ #include <netinet/in.h> #include <sys/select.h> #include <sys/socket.h> +#include <linux/keyctl.h> #include <sys/un.h> #include <bpf/btf.h> #include <time.h> #include "json_writer.h" #include "network_helpers.h" +#include "verification_cert.h" /* backtrace() and backtrace_symbols_fd() are glibc specific, * use header file when glibc is available and provide stub @@ -1928,6 +1930,13 @@ static void free_test_states(void) } } +static __u32 register_session_key(const char *key_data, size_t key_data_size) +{ + return syscall(__NR_add_key, "asymmetric", "libbpf_session_key", + (const void *)key_data, key_data_size, + KEY_SPEC_SESSION_KEYRING); +} + int main(int argc, char **argv) { static const struct argp argp = { @@ -1961,6 +1970,10 @@ int main(int argc, char **argv) /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); libbpf_set_print(libbpf_print_fn); + err = register_session_key((const char *)test_progs_verification_cert, + test_progs_verification_cert_len); + if (err < 0) + return err; traffic_monitor_set_print(traffic_monitor_print_fn); diff --git a/tools/testing/selftests/bpf/test_progs.h 
b/tools/testing/selftests/bpf/test_progs.h index df2222a1806f..eebfc18cdcd2 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -7,6 +7,7 @@ #include <errno.h> #include <string.h> #include <assert.h> +#include <regex.h> #include <stdlib.h> #include <stdarg.h> #include <time.h> @@ -546,4 +547,20 @@ extern void test_loader_fini(struct test_loader *tester); test_loader_fini(&tester); \ }) +struct expect_msg { + const char *substr; /* substring match */ + regex_t regex; + bool is_regex; + bool on_next_line; + bool negative; +}; + +struct expected_msgs { + struct expect_msg *patterns; + size_t cnt; +}; + +void validate_msgs(const char *log_buf, struct expected_msgs *msgs, + void (*emit_fn)(const char *buf, bool force)); + #endif /* __TEST_PROGS_H */ diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index fd2da2234cc9..76568db7a664 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1372,7 +1372,7 @@ run: } else fprintf(stderr, "unknown test\n"); out: - /* Detatch and zero all the maps */ + /* Detach and zero all the maps */ bpf_prog_detach2(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS); for (i = 0; i < ARRAY_SIZE(links); i++) { diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c index 595194453ff8..35b4893ccdf8 100644 --- a/tools/testing/selftests/bpf/test_tcpnotify_user.c +++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c @@ -15,20 +15,18 @@ #include <bpf/libbpf.h> #include <sys/ioctl.h> #include <linux/rtnetlink.h> -#include <signal.h> #include <linux/perf_event.h> -#include <linux/err.h> -#include "bpf_util.h" #include "cgroup_helpers.h" #include "test_tcpnotify.h" -#include "trace_helpers.h" #include "testing_helpers.h" #define SOCKET_BUFFER_SIZE (getpagesize() < 8192L ? 
getpagesize() : 8192L) pthread_t tid; +static bool exit_thread; + int rx_callbacks; static void dummyfn(void *ctx, int cpu, void *data, __u32 size) @@ -45,7 +43,7 @@ void tcp_notifier_poller(struct perf_buffer *pb) { int err; - while (1) { + while (!exit_thread) { err = perf_buffer__poll(pb, 100); if (err < 0 && err != -EINTR) { printf("failed perf_buffer__poll: %d\n", err); @@ -78,15 +76,10 @@ int main(int argc, char **argv) int error = EXIT_FAILURE; struct bpf_object *obj; char test_script[80]; - cpu_set_t cpuset; __u32 key = 0; libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - cg_fd = cgroup_setup_and_join(cg_path); if (cg_fd < 0) goto err; @@ -151,6 +144,13 @@ int main(int argc, char **argv) sleep(10); + exit_thread = true; + int ret = pthread_join(tid, NULL); + if (ret) { + printf("FAILED: pthread_join\n"); + goto err; + } + if (verify_result(&g)) { printf("FAILED: Wrong stats Expected %d calls, got %d\n", g.ncalls, rx_callbacks); diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 65aafe0003db..62db060298a4 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -241,4 +241,6 @@ done if [ $failures -eq 0 ]; then echo "All tests successful!" 
+else + exit 1 fi diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 5e9f16683be5..16eb37e5bad6 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -399,7 +399,7 @@ int unload_module(const char *name, bool verbose) return 0; } -int load_module(const char *path, bool verbose) +static int __load_module(const char *path, const char *param_values, bool verbose) { int fd; @@ -411,7 +411,7 @@ int load_module(const char *path, bool verbose) fprintf(stdout, "Can't find %s kernel module: %d\n", path, -errno); return -ENOENT; } - if (finit_module(fd, "", 0)) { + if (finit_module(fd, param_values, 0)) { fprintf(stdout, "Failed to load %s into the kernel: %d\n", path, -errno); close(fd); return -EINVAL; @@ -423,6 +423,16 @@ int load_module(const char *path, bool verbose) return 0; } +int load_module_params(const char *path, const char *param_values, bool verbose) +{ + return __load_module(path, param_values, verbose); +} + +int load_module(const char *path, bool verbose) +{ + return __load_module(path, "", verbose); +} + int unload_bpf_testmod(bool verbose) { return unload_module("bpf_testmod", verbose); diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h index 46d7f7089f63..eb20d3772218 100644 --- a/tools/testing/selftests/bpf/testing_helpers.h +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -39,6 +39,7 @@ int kern_sync_rcu(void); int finit_module(int fd, const char *param_values, int flags); int delete_module(const char *name, int flags); int load_module(const char *path, bool verbose); +int load_module_params(const char *path, const char *param_values, bool verbose); int unload_module(const char *name, bool verbose); static inline __u64 get_time_ns(void) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 81943c6254e6..171987627f3a 
100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -17,7 +17,9 @@ #include <linux/limits.h> #include <libelf.h> #include <gelf.h> +#include "bpf/hashmap.h" #include "bpf/libbpf_internal.h" +#include "bpf_util.h" #define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" #define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" @@ -519,3 +521,235 @@ void read_trace_pipe(void) { read_trace_pipe_iter(trace_pipe_cb, NULL, 0); } + +static size_t symbol_hash(long key, void *ctx __maybe_unused) +{ + return str_hash((const char *) key); +} + +static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) +{ + return strcmp((const char *) key1, (const char *) key2) == 0; +} + +static bool is_invalid_entry(char *buf, bool kernel) +{ + if (kernel && strchr(buf, '[')) + return true; + if (!kernel && !strchr(buf, '[')) + return true; + return false; +} + +static const char * const trace_blacklist[] = { + "migrate_disable", + "migrate_enable", + "rcu_read_unlock_strict", + "preempt_count_add", + "preempt_count_sub", + "__rcu_read_lock", + "__rcu_read_unlock", + "bpf_get_numa_node_id", +}; + +static bool skip_entry(char *name) +{ + int i; + + /* + * We attach to almost all kernel functions and some of them + * will cause 'suspicious RCU usage' when fprobe is attached + * to them. Filter out the current culprits - arch_cpu_idle + * default_idle and rcu_* functions. + */ + if (!strcmp(name, "arch_cpu_idle")) + return true; + if (!strcmp(name, "default_idle")) + return true; + if (!strncmp(name, "rcu_", 4)) + return true; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + return true; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + return true; + + for (i = 0; i < ARRAY_SIZE(trace_blacklist); i++) { + if (!strcmp(name, trace_blacklist[i])) + return true; + } + + return false; +} + +/* Do comparison by ignoring '.llvm.<hash>' suffixes. 
*/ +static int compare_name(const char *name1, const char *name2) +{ + const char *res1, *res2; + int len1, len2; + + res1 = strstr(name1, ".llvm."); + res2 = strstr(name2, ".llvm."); + len1 = res1 ? res1 - name1 : strlen(name1); + len2 = res2 ? res2 - name2 : strlen(name2); + + if (len1 == len2) + return strncmp(name1, name2, len1); + if (len1 < len2) + return strncmp(name1, name2, len1) <= 0 ? -1 : 1; + return strncmp(name1, name2, len2) >= 0 ? 1 : -1; +} + +static int load_kallsyms_compare(const void *p1, const void *p2) +{ + return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); +} + +static int search_kallsyms_compare(const void *p1, const struct ksym *p2) +{ + return compare_name(p1, p2->name); +} + +int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) +{ + size_t cap = 0, cnt = 0; + char *name = NULL, *ksym_name, **syms = NULL; + struct hashmap *map; + struct ksyms *ksyms; + struct ksym *ks; + char buf[256]; + FILE *f; + int err = 0; + + ksyms = load_kallsyms_custom_local(load_kallsyms_compare); + if (!ksyms) + return -EINVAL; + + /* + * The available_filter_functions contains many duplicates, + * but other than that all symbols are usable to trace. + * Filtering out duplicates by using hashmap__add, which won't + * add existing entry. 
+ */ + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); + + if (!f) + return -EINVAL; + + map = hashmap__new(symbol_hash, symbol_equal, NULL); + if (IS_ERR(map)) { + err = libbpf_get_error(map); + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) + continue; + if (skip_entry(name)) + continue; + + ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); + if (!ks) { + err = -EINVAL; + goto error; + } + + ksym_name = ks->name; + err = hashmap__add(map, ksym_name, 0); + if (err == -EEXIST) { + err = 0; + continue; + } + if (err) + goto error; + + err = libbpf_ensure_mem((void **) &syms, &cap, + sizeof(*syms), cnt + 1); + if (err) + goto error; + + syms[cnt++] = ksym_name; + } + + *symsp = syms; + *cntp = cnt; + +error: + free(name); + fclose(f); + hashmap__free(map); + if (err) + free(syms); + return err; +} + +int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) +{ + unsigned long *addr, *addrs, *tmp_addrs; + int err = 0, max_cnt, inc_cnt; + char *name = NULL; + size_t cnt = 0; + char buf[256]; + FILE *f; + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); + + if (!f) + return -ENOENT; + + /* In my local setup, the number of entries is 50k+ so Let us initially + * allocate space to hold 64k entries. If 64k is not enough, incrementally + * increase 1k each time. 
+ */ + max_cnt = 65536; + inc_cnt = 1024; + addrs = malloc(max_cnt * sizeof(long)); + if (addrs == NULL) { + err = -ENOMEM; + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) + continue; + if (skip_entry(name)) + continue; + + if (cnt == max_cnt) { + max_cnt += inc_cnt; + tmp_addrs = realloc(addrs, max_cnt); + if (!tmp_addrs) { + err = -ENOMEM; + goto error; + } + addrs = tmp_addrs; + } + + addrs[cnt++] = (unsigned long)addr; + } + + *addrsp = addrs; + *cntp = cnt; + +error: + free(name); + fclose(f); + if (err) + free(addrs); + return err; +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 2ce873c9f9aa..9437bdd4afa5 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -41,4 +41,7 @@ ssize_t get_rel_offset(uintptr_t addr); int read_build_id(const char *path, char *build_id, size_t size); +int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel); +int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel); + #endif diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h new file mode 100644 index 000000000000..549d1f774810 --- /dev/null +++ b/tools/testing/selftests/bpf/usdt.h @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * This single-header library defines a collection of variadic macros for + * defining and triggering USDTs (User Statically-Defined Tracepoints): + * + * - For USDTs without associated semaphore: + * USDT(group, name, args...) + * + * - For USDTs with implicit (transparent to the user) semaphore: + * USDT_WITH_SEMA(group, name, args...) + * USDT_IS_ACTIVE(group, name) + * + * - For USDTs with explicit (user-defined and provided) semaphore: + * USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...) 
+ * USDT_SEMA_IS_ACTIVE(sema) + * + * all of which emit a NOP instruction into the instruction stream, and so + * have *zero* overhead for the surrounding code. USDTs are identified by + * a combination of `group` and `name` identifiers, which is used by external + * tracing tooling (tracers) for identifying exact USDTs of interest. + * + * USDTs can have an associated (2-byte) activity counter (USDT semaphore), + * automatically maintained by Linux kernel whenever any correctly written + * BPF-based tracer is attached to the USDT. This USDT semaphore can be used + * to check whether there is a need to do any extra data collection and + * processing for a given USDT (if necessary), and otherwise avoid extra work + * for a common case of USDT not being traced ("active"). + * + * See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or + * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on + * working with USDTs with implicitly or explicitly associated + * USDT semaphores, respectively. + * + * There is also some additional data recorded into an auxiliary note + * section. The data in the note section describes the operands, in terms of + * size and location, used by tracing tooling to know where to find USDT + * arguments. Each location is encoded as an assembler operand string. + * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert + * breakpoints on top of the nop, and decode the location operand-strings, + * like an assembler, to find the values being passed. + * + * The operand strings are selected by the compiler for each operand. + * They are constrained by inline-assembler codes. The default is: + * + * #define USDT_ARG_CONSTRAINT nor + * + * This is a good default if the operands tend to be integral and + * moderate in number (smaller than number of registers). In other + * cases, the compiler may report "'asm' requires impossible reload" or + * similar.
In this case, consider simplifying the macro call (fewer + * and simpler operands), reduce optimization, or override the default + * constraints string via: + * + * #define USDT_ARG_CONSTRAINT g + * #include <usdt.h> + * + * For some historical description of USDT v3 format (the one used by this + * library and generally recognized and assumed by BPF-based tracing tools) + * see [0]. The more formal specification can be found at [1]. Additional + * argument constraints information can be found at [2]. + * + * Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for + * this USDT library implementation. Current implementation differs *a lot* in + * terms of exposed user API and general usability, which was the main goal + * and focus of the reimplementation work. Nevertheless, underlying recorded + * USDT definitions are fully binary compatible and any USDT-based tooling + * should work equally well with USDTs defined by either SystemTap's or this + * library's USDT implementation. + * + * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html + * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html + * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h + */ +#ifndef __USDT_H +#define __USDT_H + +/* + * Changelog: + * + * 0.1.0 + * ----- + * - Initial release + */ +#define USDT_MAJOR_VERSION 0 +#define USDT_MINOR_VERSION 1 +#define USDT_PATCH_VERSION 0 + +/* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L) +#define __usdt_va_opt 1 +#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__ +#else +#define __usdt_va_args(...) , ##__VA_ARGS__ +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. 
Zero arguments are acceptable as well. No USDT semaphore is + * associated with this USDT. + * + * Such "semaphoreless" USDTs are commonly used when there is no extra data + * collection or processing needed to collect and prepare USDT arguments and + * they are just available in the surrounding code. USDT() macro will just + * record their locations in CPU registers or in memory for tracing tooling to + * be able to access them, if necessary. + */ +#ifdef __usdt_va_opt +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__) +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. USDT also get an + * implicitly-defined associated USDT semaphore, which will be "activated" by + * tracing tooling and can be used to check whether USDT is being actively + * observed. + * + * USDTs with semaphore are commonly used when there is a need to perform + * additional data collection and processing to prepare USDT arguments, which + * otherwise might not be necessary for the rest of application logic. In such + * case, USDT semaphore can be used to avoid unnecessary extra work. If USDT + * is not traced (which is presumed to be a common situation), the associated + * USDT semaphore is "inactive", and so there is no need to waste resources to + * prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to check whether + * USDT is "active". + * + * N.B. There is an inherent (albeit short) gap between checking whether USDT + * is active and triggering corresponding USDT, in which external tracer can + * be attached to an USDT and activate USDT semaphore after the activity check. + * If such a race occurs, tracers might miss one USDT execution. 
Tracers are + * expected to accommodate such possibility and this is expected to not be + * a problem for applications and tracers. + * + * N.B. Implicit USDT semaphore defined by USDT_WITH_SEMA() is contained + * within a single executable or shared library and is not shared outside + * them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name + * identifier across executable and shared library, it will work and won't + * conflict, per se, but will define independent USDT semaphores, one for each + * shared library/executable in which USDT_WITH_SEMA(group, name) is used. + * That is, if you attach to this USDT in one shared library (or executable), + * then only USDT semaphore within that shared library (or executable) will be + * updated by the kernel, while other libraries (or executable) will not see + * activated USDT semaphore. In short, it's best to use unique USDT group:name + * identifiers across different shared libraries (and, equivalently, between + * executable and shared library). This is advanced consideration and is + * rarely (if ever) seen in practice, but just to avoid surprises this is + * called out here. (Static libraries become a part of final executable, once + * linked by linker, so the above considerations don't apply to them.) + */ +#ifdef __usdt_va_opt +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name) \ + __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name), \ + ##__VA_ARGS__) +#endif + +struct usdt_sema { volatile unsigned short active; }; + +/* + * Check if USDT with `group`:`name` identifier is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). 
+ * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into the given USDT. It is assumed that USDT is triggered with + * USDT_WITH_SEMA() macro which will implicitly define associated USDT + * semaphore. (If one needs more control over USDT semaphore, see + * USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.) + * + * N.B. Such checks are necessarily racy and speculative. Between checking + * whether USDT is active and triggering the USDT itself, tracer can be + * detached with no notification. This race should be extremely rare and worst + * case should result in one-time wasted extra data collection and processing. + */ +#define USDT_IS_ACTIVE(group, name) ({ \ + extern struct usdt_sema __usdt_sema_name(group, name) \ + __usdt_asm_name(__usdt_sema_name(group, name)); \ + __usdt_sema_implicit(__usdt_sema_name(group, name)); \ + __usdt_sema_name(group, name).active > 0; \ +}) + +/* + * APIs for working with user-defined explicit USDT semaphores. + * + * This is a less commonly used advanced API for use cases in which user needs + * an explicit control over (potentially shared across multiple USDTs) USDT + * semaphore instance. This can be used when there is a group of logically + * related USDTs that all need extra data collection and processing whenever + * any of a family of related USDTs are "activated" (i.e., traced). In such + * a case, all such related USDTs will be associated with the same shared USDT + * semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be + * triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra + * USDT semaphore identifier as an extra parameter. + */ + +/** + * Underlying C global variable name for user-defined USDT semaphore with + * `sema` identifier. Could be useful for debugging, but normally shouldn't be + * used explicitly. 
+ */ +#define USDT_SEMA(sema) __usdt_sema_##sema + +/* + * Define storage for user-defined USDT semaphore `sema`. + * + * Should be used only once in non-header source file to let compiler allocate + * space for the semaphore variable. Just like with any other global variable. + * + * This macro can be used anywhere where global variable declaration is + * allowed. Just like with global variable definitions, there should be only + * one definition of user-defined USDT semaphore with given `sema` identifier, + * otherwise compiler or linker will complain about duplicate variable + * definition. + * + * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace + * and inside namespaces (including nested namespaces). Just make sure that + * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is + * referenced, or any of its parent namespaces, so the C++ language-level + * identifier is visible to the code that needs to reference the semaphore. + * At the lowest layer, USDT semaphores have global naming and visibility + * (they have a corresponding `__usdt_sema_<name>` symbol, which can be linked + * against from C or C++ code, if necessary). To keep it simple, putting + * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest + * no-brainer solution. All these aspects are irrelevant for plain C, because + * C doesn't have namespaces and everything is always in the global namespace. + * + * N.B. Due to USDT metadata being recorded in non-allocatable ELF note + * section, it has limitations when it comes to relocations, which, in + * practice, means that it's not possible to correctly share USDT semaphores + * between main executable and shared libraries, or even between multiple + * shared libraries. USDT semaphore has to be contained to individual shared + * library or executable to avoid unpleasant surprises with half-working USDT + * semaphores. 
We enforce this by marking semaphore ELF symbols as having + * a hidden visibility. This is quite an advanced use case and consideration + * and for most users this should have no consequences whatsoever. + */ +#define USDT_DEFINE_SEMA(sema) \ + struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \ + __usdt_asm_name(USDT_SEMA(sema)) \ + __attribute__((visibility("hidden"))) = { 0 } + +/* + * Declare extern reference to user-defined USDT semaphore `sema`. + * + * Refers to a variable defined in another compilation unit by + * USDT_DEFINE_SEMA() and allows using the same USDT semaphore across + * multiple compilation units (i.e., .c and .cpp files). + * + * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities. + */ +#define USDT_DECLARE_SEMA(sema) \ + extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema)) + +/* + * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). + * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into USDT(s) associated with USDT semaphore `sema`. + * + * N.B. Such checks are necessarily racy. Between checking the state of USDT + * semaphore and triggering associated USDT(s), the active tracer might attach + * or detach. This race should be extremely rare and worst case should result + * in one-time missed USDT event or wasted extra data collection and + * processing. USDT-using tracers should be written with this in mind; this is + * not a concern of the application defining USDTs with associated semaphore. + */ +#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0) + +/* + * Invoke USDT specified by `group` and `name` identifiers and associate + * explicitly user-defined semaphore `sema` with it. Pass through `args` as + * USDT arguments. `args` are optional and zero arguments are acceptable. 
+ * + * Semaphore is defined with the help of USDT_DEFINE_SEMA() macro and can be + * checked whether active with USDT_SEMA_IS_ACTIVE(). + * + * When __VA_OPT__ is available (__usdt_va_opt is defined) the standard + * `__VA_OPT__(,) __VA_ARGS__` form is used; otherwise the GNU + * `, ##__VA_ARGS__` extension is the fallback, same as for USDT(). + */ +#ifdef __usdt_va_opt +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__) +#endif + +/* + * Adjustable implementation aspects + */ +#ifndef USDT_ARG_CONSTRAINT +#if defined __powerpc__ +#define USDT_ARG_CONSTRAINT nZr +#elif defined __arm__ +#define USDT_ARG_CONSTRAINT g +#elif defined __loongarch__ +#define USDT_ARG_CONSTRAINT nmr +#else +#define USDT_ARG_CONSTRAINT nor +#endif +#endif /* USDT_ARG_CONSTRAINT */ + +#ifndef USDT_NOP +#if defined(__ia64__) || defined(__s390__) || defined(__s390x__) +#define USDT_NOP nop 0 +#else +#define USDT_NOP nop +#endif +#endif /* USDT_NOP */ + +/* + * Implementation details + */ +/* USDT name for implicitly-defined USDT semaphore, derived from group:name */ +#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name +/* ELF section into which USDT semaphores are put */ +#define __usdt_sema_sec __attribute__((section(".probes"))) + +#define __usdt_concat(a, b) a ## b +#define __usdt_apply(fn, n) __usdt_concat(fn, n) + +#ifndef __usdt_nth +#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N +#endif + +#ifndef __usdt_narg +#ifdef __usdt_va_opt +#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#else +#define __usdt_narg(...) 
__usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif +#endif /* __usdt_narg */ + +#define __usdt_hash # +#define __usdt_str_(x) #x +#define __usdt_str(x) __usdt_str_(x) + +#ifndef __usdt_asm_name +#define __usdt_asm_name(name) __asm__(__usdt_str(name)) +#endif + +#define __usdt_asm0() "\n" +#define __usdt_asm1(x) __usdt_str(x) "\n" +#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__) +#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__) +#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__) +#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__) +#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__) +#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__) +#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__) +#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__) +#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__) +#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__) +#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__) +#define __usdt_asm(...) 
__usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#ifdef __LP64__ +#define __usdt_asm_addr .8byte +#else +#define __usdt_asm_addr .4byte +#endif + +#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x) +#define __usdt_asm_strz(x) __usdt_asm_strz_(x) +#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x) +#define __usdt_asm_str(x) __usdt_asm_str_(x) + +/* "semaphoreless" USDT case */ +#ifndef __usdt_sema_none +#define __usdt_sema_none(sema) +#endif + +/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */ +#ifndef __usdt_sema_implicit +#define __usdt_sema_implicit(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm1(.ifndef sema) \ + __usdt_asm3( .pushsection .probes, "aw", "progbits") \ + __usdt_asm1( .weak sema) \ + __usdt_asm1( .hidden sema) \ + __usdt_asm1( .align 2) \ + __usdt_asm1(sema:) \ + __usdt_asm1( .zero 2) \ + __usdt_asm2( .type sema, @object) \ + __usdt_asm2( .size sema, 2) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + ); +#endif + +/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */ +#ifndef __usdt_sema_explicit +#define __usdt_sema_explicit(sema) \ + __asm__ __volatile__ ("" :: "m" (sema)); +#endif + +/* main USDT definition (nop and .note.stapsdt metadata) */ +#define __usdt_probe(group, name, sema_def, sema, ...) 
do { \ + sema_def(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm( 990: USDT_NOP) \ + __usdt_asm3( .pushsection .note.stapsdt, "", "note") \ + __usdt_asm1( .balign 4) \ + __usdt_asm3( .4byte 992f-991f,994f-993f,3) \ + __usdt_asm1(991: .asciz "stapsdt") \ + __usdt_asm1(992: .balign 4) \ + __usdt_asm1(993: __usdt_asm_addr 990b) \ + __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \ + __usdt_asm1( __usdt_asm_addr sema) \ + __usdt_asm_strz(group) \ + __usdt_asm_strz(name) \ + __usdt_asm_args(__VA_ARGS__) \ + __usdt_asm1( .ascii "\0") \ + __usdt_asm1(994: .balign 4) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.ifndef _.stapsdt.base) \ + __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\ + __usdt_asm1( .weak _.stapsdt.base) \ + __usdt_asm1( .hidden _.stapsdt.base) \ + __usdt_asm1(_.stapsdt.base:) \ + __usdt_asm1( .space 1) \ + __usdt_asm2( .size _.stapsdt.base, 1) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + :: __usdt_asm_ops(__VA_ARGS__) \ + ); \ +} while (0) + +/* + * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + * operand note format. + * + * The named register may be a longer or shorter (!) alias for the + * storage where the value in question is found. For example, on + * i386, 64-bit value may be put in register pairs, and a register + * name stored would identify just one of them. Previously, gcc was + * asked to emit the %w[id] (16-bit alias of some registers holding + * operands), even when a wider 32-bit value was used. + * + * Bottom line: the byte-width given before the @ sign governs. If + * there is a mismatch between that width and that of the named + * register, then a sys/sdt.h note consumer may need to employ + * architecture-specific heuristics to figure out where the compiler + * has actually put the complete value. 
+ */ +#if defined(__powerpc__) || defined(__powerpc64__) +#define __usdt_argref(id) %I[id]%[id] +#elif defined(__i386__) +#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +#define __usdt_argref(id) %[id] +#endif + +#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \ + __usdt_asm1(.ascii "@") \ + __usdt_asm_str(__usdt_argref(__usdt_aval##n)) + +#define __usdt_asm_args0 /* no arguments */ +#define __usdt_asm_args1 __usdt_asm_arg(1) +#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2) +#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3) +#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4) +#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5) +#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6) +#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7) +#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8) +#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9) +#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10) +#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11) +#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12) +#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__)) + +#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5) +#define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x)) + +/* + * We can't use __builtin_choose_expr() in C++, so fall back to table-based + * signedness determination for known types, utilizing templates magic. 
+ */ +#ifdef __cplusplus + +#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed) + +#include <cstddef> + +template<typename T> struct __usdt_t { static const bool is_signed = false; }; +template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {}; +template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {}; + +#define __usdt_def_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; } +#define __usdt_maybe_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; } + +__usdt_def_signed(signed char); +__usdt_def_signed(short); +__usdt_def_signed(int); +__usdt_def_signed(long); +__usdt_def_signed(long long); +__usdt_maybe_signed(char); +__usdt_maybe_signed(wchar_t); + +#else /* !__cplusplus */ + +#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4) +#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U)) +#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1) + +#endif /* __cplusplus */ + +#define __usdt_asm_op(n, x) \ + [__usdt_asz##n] "n" ((__usdt_is_signed(x) ? 
(int)-1 : 1) * (int)__usdt_arg_size(x)), \ + [__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x) + +#define __usdt_asm_ops0() [__usdt_dummy] "g" (0) +#define __usdt_asm_ops1(x) __usdt_asm_op(1, x) +#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x) +#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x) +#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x) +#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x) +#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x) +#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x) +#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x) +#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x) +#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x) +#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x) +#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x) +#define __usdt_asm_ops(...) 
__usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#endif /* __USDT_H */ diff --git a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c index b616575c3b00..ce13002c7a19 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c +++ b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c @@ -93,7 +93,7 @@ .expected_attach_type = BPF_SK_LOOKUP, .result = VERBOSE_ACCEPT, .runs = -1, - .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8_w=-44\ + .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8=-44\ 2: (c5) if r0 s< 0x0 goto pc+2\ - R0_w=-44", + R0=-44", }, diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index f3492efc8834..c8d640802cce 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -1375,7 +1375,7 @@ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* write into map value */ @@ -1439,7 +1439,7 @@ /* second time with fp-16 */ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), @@ -1493,7 +1493,7 @@ /* second time with fp-16 */ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), @@ -2380,7 +2380,7 @@ */ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1), 
BPF_MOV64_REG(BPF_REG_9, BPF_REG_8), - /* r9 = *r9 ; verifier get's to this point via two paths: + /* r9 = *r9 ; verifier gets to this point via two paths: * ; (I) one including r9 = r8, verified first; * ; (II) one excluding r9 = r8, verified next. * ; After load of *r9 to r9 the frame[0].fp[-24].id == r9.id. diff --git a/tools/testing/selftests/bpf/verify_sig_setup.sh b/tools/testing/selftests/bpf/verify_sig_setup.sh index f2cac42298ba..09179fb551f0 100755 --- a/tools/testing/selftests/bpf/verify_sig_setup.sh +++ b/tools/testing/selftests/bpf/verify_sig_setup.sh @@ -32,7 +32,7 @@ usage() exit 1 } -setup() +genkey() { local tmp_dir="$1" @@ -45,9 +45,14 @@ setup() openssl x509 -in ${tmp_dir}/signing_key.pem -out \ ${tmp_dir}/signing_key.der -outform der +} - key_id=$(cat ${tmp_dir}/signing_key.der | keyctl padd asymmetric ebpf_testing_key @s) +setup() +{ + local tmp_dir="$1" + genkey "${tmp_dir}" + key_id=$(cat ${tmp_dir}/signing_key.der | keyctl padd asymmetric ebpf_testing_key @s) keyring_id=$(keyctl newring ebpf_testing_keyring @s) keyctl link $key_id $keyring_id } @@ -105,6 +110,8 @@ main() if [[ "${action}" == "setup" ]]; then setup "${tmp_dir}" + elif [[ "${action}" == "genkey" ]]; then + genkey "${tmp_dir}" elif [[ "${action}" == "cleanup" ]]; then cleanup "${tmp_dir}" elif [[ "${action}" == "fsverity-create-sign" ]]; then diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index d532dd82a3a8..e962f133250c 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -181,6 +181,12 @@ struct var_preset { bool applied; }; +enum dump_mode { + DUMP_NONE = 0, + DUMP_XLATED = 1, + DUMP_JITED = 2, +}; + static struct env { char **filenames; int filename_cnt; @@ -227,6 +233,7 @@ static struct env { char orig_cgroup[PATH_MAX]; char stat_cgroup[PATH_MAX]; int memory_peak_fd; + __u32 dump_mode; } env; static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list 
args) @@ -271,6 +278,7 @@ const char argp_program_doc[] = enum { OPT_LOG_FIXED = 1000, OPT_LOG_SIZE = 1001, + OPT_DUMP = 1002, }; static const struct argp_option opts[] = { @@ -295,6 +303,7 @@ static const struct argp_option opts[] = { "Force BPF verifier failure on register invariant violation (BPF_F_TEST_REG_INVARIANTS program flag)" }, { "top-src-lines", 'S', "N", 0, "Emit N most frequent source code lines" }, { "set-global-vars", 'G', "GLOBAL", 0, "Set global variables provided in the expression, for example \"var1 = 1\"" }, + { "dump", OPT_DUMP, "DUMP_MODE", OPTION_ARG_OPTIONAL, "Print BPF program dump (xlated, jited)" }, {}, }; @@ -427,6 +436,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return err; } break; + case OPT_DUMP: + if (!arg || strcasecmp(arg, "xlated") == 0) { + env.dump_mode |= DUMP_XLATED; + } else if (strcasecmp(arg, "jited") == 0) { + env.dump_mode |= DUMP_JITED; + } else { + fprintf(stderr, "Unrecognized dump mode '%s'\n", arg); + return -EINVAL; + } + break; default: return ARGP_ERR_UNKNOWN; } @@ -1554,6 +1573,36 @@ static int parse_rvalue(const char *val, struct rvalue *rvalue) return 0; } +static void dump(__u32 prog_id, enum dump_mode mode, const char *file_name, const char *prog_name) +{ + char command[64], buf[4096]; + FILE *fp; + int status; + + status = system("command -v bpftool > /dev/null 2>&1"); + if (status != 0) { + fprintf(stderr, "bpftool is not available, can't print program dump\n"); + return; + } + snprintf(command, sizeof(command), "bpftool prog dump %s id %u", + mode == DUMP_JITED ? "jited" : "xlated", prog_id); + fp = popen(command, "r"); + if (!fp) { + fprintf(stderr, "bpftool failed with error: %d\n", errno); + return; + } + + printf("DUMP (%s) %s/%s:\n", mode == DUMP_JITED ? 
"JITED" : "XLATED", file_name, prog_name); + while (fgets(buf, sizeof(buf), fp)) + fputs(buf, stdout); + fprintf(stdout, "\n"); + + if (ferror(fp)) + fprintf(stderr, "Failed to dump BPF prog with error: %d\n", errno); + + pclose(fp); +} + static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) { const char *base_filename = basename(strdupa(filename)); @@ -1630,8 +1679,13 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf memset(&info, 0, info_len); fd = bpf_program__fd(prog); - if (fd > 0 && bpf_prog_get_info_by_fd(fd, &info, &info_len) == 0) + if (fd > 0 && bpf_prog_get_info_by_fd(fd, &info, &info_len) == 0) { stats->stats[JITED_SIZE] = info.jited_prog_len; + if (env.dump_mode & DUMP_JITED) + dump(info.id, DUMP_JITED, base_filename, prog_name); + if (env.dump_mode & DUMP_XLATED) + dump(info.id, DUMP_XLATED, base_filename, prog_name); + } parse_verif_log(buf, buf_sz, stats); diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c index 1503a1d2faa0..9ed8c796645d 100644 --- a/tools/testing/selftests/bpf/xdping.c +++ b/tools/testing/selftests/bpf/xdping.c @@ -155,7 +155,7 @@ int main(int argc, char **argv) } if (!server) { - /* Only supports IPv4; see hints initiailization above. */ + /* Only supports IPv4; see hints initialization above. */ if (getaddrinfo(argv[optind], NULL, &hints, &a) || !a) { fprintf(stderr, "Could not resolve %s\n", argv[optind]); return 1; diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h index 93c2cc413cfc..48729da142c2 100644 --- a/tools/testing/selftests/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -93,8 +93,8 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) /* Refresh the local tail pointer. 
* cached_cons is r->size bigger than the real consumer pointer so * that this addition can be avoided in the more frequently - * executed code that computs free_entries in the beginning of - * this function. Without this optimization it whould have been + * executed code that computes free_entries in the beginning of + * this function. Without this optimization it would have been * free_entries = r->cached_prod - r->cached_cons + r->size. */ r->cached_cons = __atomic_load_n(r->consumer, __ATOMIC_ACQUIRE); diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index a29de0713f19..352adc8df2d1 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -2276,25 +2276,13 @@ static int testapp_xdp_metadata_copy(struct test_spec *test) { struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - struct bpf_map *data_map; - int count = 0; - int key = 0; test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata, skel_tx->progs.xsk_xdp_populate_metadata, skel_rx->maps.xsk, skel_tx->maps.xsk); test->ifobj_rx->use_metadata = true; - data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); - if (!data_map || !bpf_map__is_internal(data_map)) { - ksft_print_msg("Error: could not find bss section of XDP program\n"); - return TEST_FAILURE; - } - - if (bpf_map_update_elem(bpf_map__fd(data_map), &key, &count, BPF_ANY)) { - ksft_print_msg("Error: could not update count element\n"); - return TEST_FAILURE; - } + skel_rx->bss->count = 0; return testapp_validate_traffic(test); } diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 0e89fcff4d05..44c52f620fda 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -522,6 +522,18 @@ int proc_mount_contains(const char *option) return 
strstr(buf, option) != NULL; } +int cgroup_feature(const char *feature) +{ + char buf[PAGE_SIZE]; + ssize_t read; + + read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf)); + if (read < 0) + return read; + + return strstr(buf, feature) != NULL; +} + ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) { char path[PATH_MAX]; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index c69cab66254b..9dc90a1b386d 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -60,6 +60,7 @@ extern int cg_run_nowait(const char *cgroup, extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); int proc_mount_contains(const char *option); +int cgroup_feature(const char *feature); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); extern pid_t clone_into_cgroup(int cgroup_fd); diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 8730645d363a..dfb763819581 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -804,6 +804,662 @@ cleanup: return ret; } +/* + * Get the current frozen_usec for the cgroup. + */ +static long cg_check_freezetime(const char *cgroup) +{ + return cg_read_key_long(cgroup, "cgroup.stat.local", + "frozen_usec "); +} + +/* + * Test that the freeze time will behave as expected for an empty cgroup. 
+ */ +static int test_cgfreezer_time_empty(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_empty"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create an empty cgroup and check that its freeze time + * is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 2) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 3) Unfreeze the cgroup. Check that the freeze time is + * larger than at 2). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Check the freeze time again to ensure that it has not + * changed. + */ + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * A simple test for cgroup freezer time accounting. This test follows + * the same flow as test_cgfreezer_time_empty, but with a single process + * in the cgroup. + */ +static int test_cgfreezer_time_simple(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_simple"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create a cgroup and check that its freeze time is 0. 
+ */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Populate the cgroup with one child and check that the + * freeze time is still 0. + */ + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr > prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 3) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 4) Unfreeze the cgroup. Check that the freeze time is + * larger than at 3). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Sleep for 1000 us. Check that the freeze time is the + * same as at 4). + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that freezer time accounting works as expected, even while we're + * populating a cgroup with processes. 
+ */ +static int test_cgfreezer_time_populate(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + int i; + + cgroup = cg_name(root, "cg_time_test_populate"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 1) Populate the cgroup with 100 processes. Check that + * the freeze time is 0. Stop at the first child that cannot + * be started instead of waiting for a process count that + * would never be reached. + */ + for (i = 0; i < 100; i++) { + if (cg_run_nowait(cgroup, child_fn, NULL) < 0) + goto cleanup; + } + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Wait for the group to become fully populated. Check + * that the freeze time is 0. + */ + if (cg_wait_for_proc_count(cgroup, 100)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 3) Freeze the cgroup and then populate it with 100 more + * processes. Check that the freeze time continues to grow. + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + for (i = 0; i < 100; i++) { + if (cg_run_nowait(cgroup, child_fn, NULL) < 0) + goto cleanup; + } + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Wait for the group to become fully populated. Check + * that the freeze time is larger than at 3). 
+ */ + if (cg_wait_for_proc_count(cgroup, 200)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Unfreeze the cgroup. Check that the freeze time is + * larger than at 4). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 6) Kill the processes. Check that the freeze time is the + * same as it was at 5). + */ + if (cg_killall(cgroup)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 7) Freeze and unfreeze the cgroup. Check that the freeze + * time is larger than it was at 6). + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that frozen time for a cgroup continues to work as expected, + * even as processes are migrated. Frozen cgroup A's freeze time should + * continue to increase and running cgroup B's should stay 0. 
+ */ +static int test_cgfreezer_time_migrate(const char *root) +{ + long prev_A, curr_A, curr_B; + char *cgroup[2] = {0}; + int ret = KSFT_FAIL; + int pid; + + cgroup[0] = cg_name(root, "cg_time_test_migrate_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(root, "cg_time_test_migrate_B"); + if (!cgroup[1]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; /* negative freeze time: skip instead of fail */ + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + pid = cg_run_nowait(cgroup[0], child_fn, NULL); + if (pid < 0) + goto cleanup; + + if (cg_wait_for_proc_count(cgroup[0], 1)) + goto cleanup; + + curr_A = cg_check_freezetime(cgroup[0]); /* nothing frozen yet: A and B must both read 0 */ + if (curr_A) { + debug("Expect time (%ld) to be 0\n", curr_A); + goto cleanup; + } + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + /* + * Freeze cgroup A. prev_A is 0 at this point (checked above), + * so the comparison below verifies that A's freeze time has + * become non-zero. + */ + if (cg_freeze_wait(cgroup[0], true)) + goto cleanup; + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be > 0\n", curr_A); + goto cleanup; + } + + /* + * Migrate from A (frozen) to B (running). Afterwards B's freeze + * time must still read 0, while A, which is never unfrozen in + * this test, must report a larger value than before. + */ + if (cg_enter(cgroup[1], pid)) + goto cleanup; + + usleep(1000); /* presumably gives A's counter time to advance before re-reading */ + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr_A, prev_A); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup[0]) + cg_destroy(cgroup[0]); + free(cgroup[0]); + if (cgroup[1]) + cg_destroy(cgroup[1]); + free(cgroup[1]); + return ret; +} + +/* + * The test creates a cgroup and freezes it. Then it creates a child cgroup. + * After that it checks that the child cgroup has a non-zero freeze time + * that is less than the parent's. 
Next, it freezes the child, unfreezes + * the parent, and sleeps. Finally, it checks that the child's freeze + * time has grown larger than the parent's. + */ +static int test_cgfreezer_time_parent(const char *root) +{ + char *parent, *child = NULL; /* parent is assigned before the first goto cleanup */ + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_parent_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_parent_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; /* negative freeze time: skip instead of fail */ + goto cleanup; + } + + if (cg_freeze_wait(parent, true)) + goto cleanup; + + usleep(1000); /* parent is already frozen while the child does not exist yet */ + if (cg_create(child)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + /* + * Since the parent was frozen the entire time the child cgroup + * was being created, we expect the parent's freeze time to be + * larger than the child's. + * + * Ideally, we would be able to check both times simultaneously, + * but here we get the child's after we get the parent's. + */ + ptime = cg_check_freezetime(parent); + ctime = cg_check_freezetime(child); + if (ptime <= ctime) { + debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime); + goto cleanup; + } + + if (cg_freeze_nowait(child, true)) /* freeze the child itself before thawing the parent */ + goto cleanup; + + if (cg_freeze_wait(parent, false)) + goto cleanup; + + if (cg_check_frozen(child, true)) /* child must still be frozen after the parent was thawed */ + goto cleanup; + + usleep(100000); /* 100 ms with only the child frozen, so its time can overtake the parent's */ + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + + if (ctime <= ptime) { + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates a parent cgroup and a child cgroup. Then, it freezes + * the child and checks that the child's freeze time is greater than the + * parent's, which should be zero. 
+ */ +static int test_cgfreezer_time_child(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_child_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_child_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + /* A negative freeze time makes the test skip rather than fail. */ + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(child)) + goto cleanup; + + if (cg_freeze_wait(child, true)) + goto cleanup; + + /* + * Only the child has been frozen, so the parent's freeze time + * must still be 0 and the child's must be strictly larger. + */ + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + if (ptime != 0) { + debug("Expect ptime (%ld) to be 0\n", ptime); + goto cleanup; + } + + if (ctime <= ptime) { + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates the following hierarchy: + * A + * | + * B + * | + * C + * + * Then it freezes the cgroups in the order C, B, A. + * Then it unfreezes the cgroups in the order A, B, C. + * Then it checks that C's freeze time is larger than B's and + * that B's is larger than A's. 
+ */ +static int test_cgfreezer_time_nested(const char *root) +{ + char *cgroup[3] = {0}; + int ret = KSFT_FAIL; + long time[3] = {0}; + int i; + + cgroup[0] = cg_name(root, "cg_test_time_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(cgroup[0], "B"); + if (!cgroup[1]) + goto cleanup; + + cgroup[2] = cg_name(cgroup[1], "C"); + if (!cgroup[2]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + /* A negative freeze time makes the test skip rather than fail. */ + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + if (cg_create(cgroup[2])) + goto cleanup; + + /* Freeze innermost first: C, then B, then A. */ + if (cg_freeze_nowait(cgroup[2], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[0], true)) + goto cleanup; + + usleep(1000); + + /* Unfreeze outermost first: A, then B, then C. */ + if (cg_freeze_nowait(cgroup[0], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], false)) + goto cleanup; + + /* + * C was frozen first and unfrozen last, A was frozen last and + * unfrozen first, so the times are expected to be strictly + * ordered C > B > A. + */ + time[2] = cg_check_freezetime(cgroup[2]); + time[1] = cg_check_freezetime(cgroup[1]); + time[0] = cg_check_freezetime(cgroup[0]); + + if (time[2] <= time[1]) { + debug("Expect C's time (%ld) > B's time (%ld)\n", time[2], time[1]); + goto cleanup; + } + + if (time[1] <= time[0]) { + debug("Expect B's time (%ld) > A's time (%ld)\n", time[1], time[0]); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + /* + * Tear down innermost first. Every slot is visited: a NULL entry + * (cg_name() failed part-way) must not end the walk, or the names + * allocated before it would leak. + */ + for (i = 2; i >= 0; i--) { + if (!cgroup[i]) + continue; + cg_destroy(cgroup[i]); + free(cgroup[i]); + } + + return ret; +} + #define T(x) { x, #x } struct cgfreezer_test { int (*fn)(const char *root); @@ -819,6 +1475,13 @@ struct cgfreezer_test { T(test_cgfreezer_stopped), T(test_cgfreezer_ptraced), T(test_cgfreezer_vfork), + T(test_cgfreezer_time_empty), + T(test_cgfreezer_time_simple), + T(test_cgfreezer_time_populate), + T(test_cgfreezer_time_migrate), + T(test_cgfreezer_time_parent), + T(test_cgfreezer_time_child), + T(test_cgfreezer_time_nested), }; #undef T diff --git a/tools/testing/selftests/cgroup/test_pids.c 
b/tools/testing/selftests/cgroup/test_pids.c index 9ecb83c6cc5c..d8a1d1cd5007 100644 --- a/tools/testing/selftests/cgroup/test_pids.c +++ b/tools/testing/selftests/cgroup/test_pids.c @@ -77,6 +77,9 @@ static int test_pids_events(const char *root) char *cg_parent = NULL, *cg_child = NULL; int pid; + if (cgroup_feature("pids_localevents") <= 0) + return KSFT_SKIP; + cg_parent = cg_name(root, "pids_parent"); cg_child = cg_name(cg_parent, "pids_child"); if (!cg_parent || !cg_child) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index 5a5a7a5f7e1d..a4ac80bb1003 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -446,9 +446,6 @@ TEST_F(coredump, socket_detect_userspace_client) if (info.coredump_mask & PIDFD_COREDUMPED) goto out; - if (read(fd_coredump, &c, 1) < 1) - goto out; - exit_code = EXIT_SUCCESS; out: if (fd_peer_pidfd >= 0) diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 5b230deb19e8..9a3499827d4b 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -4,6 +4,7 @@ TEST_GEN_FILES += access_memory access_memory_even TEST_FILES = _damon_sysfs.py +TEST_FILES += drgn_dump_damon_status.py # functionality tests TEST_PROGS += sysfs.sh diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 2b10854e4b1e..44b98f17f8ff 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -10,7 +10,8 @@ TEST_PROGS := \ mode-2-recovery-updelay.sh \ bond_options.sh \ bond-eth-type-change.sh \ - bond_macvlan_ipvlan.sh + bond_macvlan_ipvlan.sh \ + bond_passive_lacp.sh TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh 
b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index 7bc148889ca7..187b478d0ddf 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -7,6 +7,8 @@ ALL_TESTS=" prio arp_validate num_grat_arp + fail_over_mac + vlan_over_bond " lib_dir=$(dirname "$0") @@ -352,8 +354,8 @@ garp_test() exp_num=$(echo "${param}" | cut -f6 -d ' ') active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") - slowwait_for_counter $((exp_num + 5)) $exp_num \ - tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" + slowwait_for_counter $((exp_num + 5)) $exp_num tc_rule_handle_stats_get \ + "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" &> /dev/null # check result real_num=$(tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}") @@ -376,6 +378,197 @@ num_grat_arp() done } +check_all_mac_same() +{ + RET=0 + # all slaves should have same mac address (with the first port's mac) + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + local eth1_mac=$(ip -n "$s_ns" -j link show eth1 | jq -r '.[]["address"]') + local eth2_mac=$(ip -n "$s_ns" -j link show eth2 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ] || [ "$eth0_mac" != "$bond_mac" ] || \ + [ "$eth1_mac" != "$bond_mac" ] || [ "$eth2_mac" != "$bond_mac" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_first() +{ + RET=0 + # bond mac address should be same with the first added slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_active() +{ + RET=0 + # bond mac address should be same with active slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local 
active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + local active_slave_mac=$(ip -n "$s_ns" -j link show "$active_slave" | jq -r '.[]["address"]') + if [ "$bond_mac" != "$active_slave_mac" ]; then + RET=1 + fi +} + +check_backup_slave_mac_not_change() +{ + RET=0 + # backup slave's mac address is not changed + if ip -n "$s_ns" -d -j link show type bond_slave | jq -e '.[] + | select(.linkinfo.info_slave_data.state=="BACKUP") + | select(.address != .linkinfo.info_slave_data.perm_hwaddr)' &> /dev/null; then + RET=1 + fi +} + +check_backup_slave_mac_inherit() +{ + local backup_mac + RET=0 + + # backup slaves should use mac[1] or mac[2] + local backup_macs=$(ip -n "$s_ns" -d -j link show type bond_slave | \ + jq -r '.[] | select(.linkinfo.info_slave_data.state=="BACKUP") | .address') + for backup_mac in $backup_macs; do + if [ "$backup_mac" != "${mac[1]}" ] && [ "$backup_mac" != "${mac[2]}" ]; then + RET=1 + fi + done +} + +check_first_slave_random_mac() +{ + RET=0 + # remove the first added slave and added it back + ip -n "$s_ns" link set eth0 nomaster + ip -n "$s_ns" link set eth0 master bond0 + + # the first slave should use random mac address + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" = "${mac[0]}" ] && RET=1 + log_test "bond fail_over_mac follow" "random first slave mac" + + # remove the first slave, the permanent MAC address should be restored back + ip -n "$s_ns" link set eth0 nomaster + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" != "${mac[0]}" ] && RET=1 +} + +do_active_backup_failover() +{ + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + ip -n ${s_ns} link set ${active_slave} down + slowwait 2 active_slave_changed $active_slave + ip -n ${s_ns} link set ${active_slave} up +} + +fail_over_mac() +{ + # Bring down the first interface on the switch to force the bond to + # 
select another active interface instead of the first one that joined. + ip -n "$g_ns" link set s0 down + + # fail_over_mac none + bond_reset "mode active-backup miimon 100 fail_over_mac 0" + check_all_mac_same + log_test "fail_over_mac 0" "all slaves have same mac" + do_active_backup_failover + check_all_mac_same + log_test "fail_over_mac 0" "failover: all slaves have same mac" + + # fail_over_mac active + bond_reset "mode active-backup miimon 100 fail_over_mac 1" + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "backup slave mac is not changed" + do_active_backup_failover + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "failover: backup slave mac is not changed" + + # fail_over_mac follow + bond_reset "mode active-backup miimon 100 fail_over_mac 2" + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "backup slave mac inherit" + do_active_backup_failover + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "failover: bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "failover: backup slave mac inherit" + check_first_slave_random_mac + log_test "fail_over_mac 2" "first slave mac random" +} + +vlan_over_bond_arp() +{ + local mode="$1" + RET=0 + + bond_reset "mode $mode arp_interval 100 arp_ip_target 192.0.3.10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 
192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 101 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond arp" "$mode" +} + +vlan_over_bond_ns() +{ + local mode="$1" + RET=0 + + if skip_ns; then + log_test_skip "vlan_over_bond ns" "$mode" + return 0 + fi + + bond_reset "mode $mode arp_interval 100 ns_ip6_target 2001:db8::3:10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 102 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond ns" "$mode" +} + +vlan_over_bond() +{ + # add vlan 3 for client + ip -n "${c_ns}" link add eth0.3 link eth0 type vlan id 3 + ip -n "${c_ns}" link set eth0.3 up + ip -n "${c_ns}" addr add 192.0.3.10/24 dev eth0.3 + ip -n "${c_ns}" addr add 2001:db8::3:10/64 dev eth0.3 + + # Add tc rule to check the vlan pkts + tc -n "${c_ns}" qdisc add dev eth0.3 clsact + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol arp \ + handle 101 flower skip_hw arp_op request \ + arp_sip 192.0.3.1 arp_tip 192.0.3.10 action pass + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol ipv6 \ + handle 102 flower skip_hw ip_proto icmpv6 \ + type 135 src_ip 2001:db8::3:1 action pass + + vlan_over_bond_arp "active-backup" + vlan_over_bond_ns "active-backup" +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh new file mode 100755 index 000000000000..9c3b089813df --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test if a bond interface works with 
lacp_active=off. + +# shellcheck disable=SC2034 +REQUIRE_MZ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +# shellcheck disable=SC1091 +source "$lib_dir"/../../../net/forwarding/lib.sh + +# shellcheck disable=SC2317 +check_port_state() +{ + local netns=$1 + local port=$2 + local state=$3 + + ip -n "${netns}" -d -j link show "$port" | \ + jq -e ".[].linkinfo.info_slave_data.ad_actor_oper_port_state_str | index(\"${state}\") != null" > /dev/null +} + +check_pkt_count() +{ + RET=0 + local ns="$1" + local iface="$2" + + # wait 65s, one per 30s + slowwait_for_counter 65 2 tc_rule_handle_stats_get \ + "dev ${iface} egress" 101 ".packets" "-n ${ns}" &> /dev/null +} + +setup() { + setup_ns c_ns s_ns + + # shellcheck disable=SC2154 + ip -n "${c_ns}" link add eth0 type veth peer name eth0 netns "${s_ns}" + ip -n "${c_ns}" link add eth1 type veth peer name eth1 netns "${s_ns}" + + # Add tc filter to count the pkts + tc -n "${c_ns}" qdisc add dev eth0 clsact + tc -n "${c_ns}" filter add dev eth0 egress handle 101 protocol 0x8809 matchall action pass + tc -n "${s_ns}" qdisc add dev eth1 clsact + tc -n "${s_ns}" filter add dev eth1 egress handle 101 protocol 0x8809 matchall action pass + + ip -n "${s_ns}" link add bond0 type bond mode 802.3ad lacp_active on lacp_rate fast + ip -n "${s_ns}" link set eth0 master bond0 + ip -n "${s_ns}" link set eth1 master bond0 + + ip -n "${c_ns}" link add bond0 type bond mode 802.3ad lacp_active off lacp_rate fast + ip -n "${c_ns}" link set eth0 master bond0 + ip -n "${c_ns}" link set eth1 master bond0 + +} + +trap cleanup_all_ns EXIT +setup + +# The bond will send 2 lacpdu pkts during init time, let's wait at least 2s +# after interface up +ip -n "${c_ns}" link set bond0 up +sleep 2 + +# 1. The passive side shouldn't send LACPDU. +check_pkt_count "${c_ns}" "eth0" && RET=1 +log_test "802.3ad lacp_active off" "init port" + +ip -n "${s_ns}" link set bond0 up +# 2. The passive side should not have the 'active' flag. 
+RET=0 +slowwait 2 check_port_state "${c_ns}" "eth0" "active" && RET=1 +log_test "802.3ad lacp_active off" "port state active" + +# 3. The active side should have the 'active' flag. +RET=0 +slowwait 2 check_port_state "${s_ns}" "eth0" "active" || RET=1 +log_test "802.3ad lacp_active on" "port state active" + +# 4. Make sure the connection is not expired. +RET=0 +slowwait 5 check_port_state "${s_ns}" "eth0" "distributing" +slowwait 10 check_port_state "${s_ns}" "eth0" "expired" && RET=1 +log_test "bond 802.3ad lacp_active off" "port connection" + +# After testing, disconnect one port on each side to check the state. +ip -n "${s_ns}" link set eth0 nomaster +ip -n "${s_ns}" link set eth0 up +ip -n "${c_ns}" link set eth1 nomaster +ip -n "${c_ns}" link set eth1 up +# Due to Periodic Machine and Rx Machine state change, the bond will still +# send lacpdu pkts in a few seconds. sleep at least 5s to make sure +# negotiation finished +sleep 5 + +# 5. The active side should keep sending LACPDU. +check_pkt_count "${s_ns}" "eth1" || RET=1 +log_test "bond 802.3ad lacp_active on" "port pkt after disconnect" + +# 6. The passive side shouldn't send LACPDU anymore. 
+check_pkt_count "${c_ns}" "eth0" && RET=1 +log_test "bond 802.3ad lacp_active off" "port pkt after disconnect" + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh index 195ef83cfbf1..167aa4a4a12a 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh @@ -39,6 +39,8 @@ g_ip4="192.0.2.254" s_ip6="2001:db8::1" c_ip6="2001:db8::10" g_ip6="2001:db8::254" +mac[0]="00:0a:0b:0c:0d:01" +mac[1]="00:0a:0b:0c:0d:02" gateway_create() { @@ -62,6 +64,7 @@ server_create() for i in $(seq 0 1); do ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh index 3a1333d9a85b..23a2932301cc 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh @@ -26,6 +26,7 @@ # +-------------------------------------+ source bond_topo_2d1c.sh +mac[2]="00:0a:0b:0c:0d:03" setup_prepare() { @@ -36,6 +37,7 @@ setup_prepare() # Add the extra device as we use 3 down links for bond0 local i=2 ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 ip -n ${s_ns} link set eth${i} master bond0 diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index dad4e5fda4db..832fa1caeb66 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -6,6 +6,8 @@ CONFIG_MACVLAN=y CONFIG_IPVLAN=y 
CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_CLS_MATCHALL=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y +CONFIG_VLAN_8021Q=m diff --git a/tools/testing/selftests/drivers/net/hw/csum.py b/tools/testing/selftests/drivers/net/hw/csum.py index cd23af875317..3e3a89a34afe 100755 --- a/tools/testing/selftests/drivers/net/hw/csum.py +++ b/tools/testing/selftests/drivers/net/hw/csum.py @@ -17,7 +17,7 @@ def test_receive(cfg, ipver="6", extra_args=None): ip_args = f"-{ipver} -S {cfg.remote_addr_v[ipver]} -D {cfg.addr_v[ipver]}" rx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -n 100 {ip_args} -r 1 -R {extra_args}" - tx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -n 100 {ip_args} -r 1 -T {extra_args}" + tx_cmd = f"{cfg.bin_remote} -i {cfg.remote_ifname} -n 100 {ip_args} -r 1 -T {extra_args}" with bkg(rx_cmd, exit_wait=True): wait_port_listen(34000, proto="udp") @@ -37,7 +37,7 @@ def test_transmit(cfg, ipver="6", extra_args=None): if extra_args != "-U -Z": extra_args += " -r 1" - rx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -L 1 -n 100 {ip_args} -R {extra_args}" + rx_cmd = f"{cfg.bin_remote} -i {cfg.remote_ifname} -L 1 -n 100 {ip_args} -R {extra_args}" tx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -L 1 -n 100 {ip_args} -T {extra_args}" with bkg(rx_cmd, host=cfg.remote, exit_wait=True): diff --git a/tools/testing/selftests/drivers/net/napi_threaded.py b/tools/testing/selftests/drivers/net/napi_threaded.py index b2698db39817..9699a100a87d 100755 --- a/tools/testing/selftests/drivers/net/napi_threaded.py +++ b/tools/testing/selftests/drivers/net/napi_threaded.py @@ -35,6 +35,8 @@ def _setup_deferred_cleanup(cfg) -> None: threaded = cmd(f"cat /sys/class/net/{cfg.ifname}/threaded").stdout defer(_set_threaded_state, cfg, threaded) + return combined + def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: """ @@ -49,7 +51,7 @@ def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: napi0_id = napis[0]['id'] napi1_id = napis[1]['id'] - 
_setup_deferred_cleanup(cfg) + qcnt = _setup_deferred_cleanup(cfg) # set threaded _set_threaded_state(cfg, 1) @@ -62,7 +64,7 @@ def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: nl.napi_set({'id': napi1_id, 'threaded': 'disabled'}) cmd(f"ethtool -L {cfg.ifname} combined 1") - cmd(f"ethtool -L {cfg.ifname} combined 2") + cmd(f"ethtool -L {cfg.ifname} combined {qcnt}") _assert_napi_threaded_enabled(nl, napi0_id) _assert_napi_threaded_disabled(nl, napi1_id) @@ -80,7 +82,7 @@ def change_num_queues(cfg, nl) -> None: napi0_id = napis[0]['id'] napi1_id = napis[1]['id'] - _setup_deferred_cleanup(cfg) + qcnt = _setup_deferred_cleanup(cfg) # set threaded _set_threaded_state(cfg, 1) @@ -90,7 +92,7 @@ def change_num_queues(cfg, nl) -> None: _assert_napi_threaded_enabled(nl, napi1_id) cmd(f"ethtool -L {cfg.ifname} combined 1") - cmd(f"ethtool -L {cfg.ifname} combined 2") + cmd(f"ethtool -L {cfg.ifname} combined {qcnt}") # check napi threaded is set for both napis _assert_napi_threaded_enabled(nl, napi0_id) diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index fcbdb1297e24..64ac0dfa46b7 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only dnotify_test devpts_pts +fclog file_stressor anon_inode_test kernfs_test diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile index 73d4650af1a5..85427d7f19b9 100644 --- a/tools/testing/selftests/filesystems/Makefile +++ b/tools/testing/selftests/filesystems/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test +TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog TEST_GEN_PROGS_EXTENDED := dnotify_test include ../lib.mk diff --git a/tools/testing/selftests/filesystems/fclog.c 
b/tools/testing/selftests/filesystems/fclog.c new file mode 100644 index 000000000000..912a8b755c3b --- /dev/null +++ b/tools/testing/selftests/filesystems/fclog.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai <cyphar@cyphar.com> + * Copyright (C) 2025 SUSE LLC. + */ + +#include <assert.h> +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mount.h> + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +FIXTURE(ns) +{ + int host_mntns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. */ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); +} + +TEST_F(ns, fscontext_log_enodata) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + /* A brand new fscontext has no log entries. 
*/ + char buf[128] = {}; + for (int i = 0; i < 16; i++) + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_errorfc) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + char buf[128] = {}; + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_errorfc_after_fsmount) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NOSUID); + ASSERT_SUCCESS(mfd); + ASSERT_SUCCESS(move_mount(mfd, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)); + + /* + * The fscontext log should still contain data even after + * FSCONFIG_CMD_CREATE and fsmount(). + */ + char buf[128] = {}; + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_emsgsize) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + char buf[128] = {}; + /* + * Attempting to read a message with too small a buffer should not + * result in the message getting consumed. 
+ */ + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 0)); + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 1)); + for (int i = 0; i < 16; i++) + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 16)); + + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c index 63ce708d93ed..e4b7c2b457ee 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu> #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include <fcntl.h> #include <sched.h> #include <stdio.h> @@ -10,20 +17,12 @@ #include <sys/mount.h> #include <unistd.h> #include <sys/syscall.h> +#include <sys/fanotify.h> #include "../../kselftest_harness.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include <sys/fanotify.h> - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_cmds[] = { diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c index 090a5ca65004..9f57ca46e3af 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu> #define 
_GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include <fcntl.h> #include <sched.h> #include <stdio.h> @@ -10,21 +17,12 @@ #include <sys/mount.h> #include <unistd.h> #include <sys/syscall.h> +#include <sys/fanotify.h> #include "../../kselftest_harness.h" -#include "../../pidfd/pidfd.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include <sys/fanotify.h> - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_types[] = { diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 8cfb87f7f7c5..490ace1f017e 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,12 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 +PKG_CONFIG ?= pkg-config +LIBNUMA_TEST = $(shell sh -c "$(PKG_CONFIG) numa --atleast-version 2.0.16 > /dev/null 2>&1 && echo SUFFICIENT || echo NO") + INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) -CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES) +CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 $(INCLUDES) $(KHDR_INCLUDES) -DLIBNUMA_VER_$(LIBNUMA_TEST)=1 LDLIBS := -lpthread -lrt -lnuma LOCAL_HDRS := \ ../include/futextest.h \ - ../include/atomic.h \ - ../include/logging.h + ../include/atomic.h TEST_GEN_PROGS := \ futex_wait_timeout \ futex_wait_wouldblock \ diff --git a/tools/testing/selftests/futex/functional/futex_numa.c b/tools/testing/selftests/futex/functional/futex_numa.c index f29e4d627e79..e0a33510ccb6 100644 --- a/tools/testing/selftests/futex/functional/futex_numa.c +++ b/tools/testing/selftests/futex/functional/futex_numa.c @@ -5,9 +5,10 @@ #include <sys/mman.h> #include <fcntl.h> #include 
<stdbool.h> +#include <stdio.h> +#include <stdlib.h> #include <time.h> #include <assert.h> -#include "logging.h" #include "futextest.h" #include "futex2test.h" diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index a9ecfb2d3932..d037a3f10ee8 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -16,9 +16,9 @@ #include <linux/futex.h> #include <sys/mman.h> -#include "logging.h" #include "futextest.h" #include "futex2test.h" +#include "../../kselftest_harness.h" #define MAX_THREADS 64 @@ -77,7 +77,7 @@ static void join_max_threads(void) } } -static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flags) +static void __test_futex(void *futex_ptr, int err_value, unsigned int futex_flags) { int to_wake, ret, i, need_exit = 0; @@ -88,11 +88,17 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag do { ret = futex2_wake(futex_ptr, to_wake, futex_flags); - if (must_fail) { - if (ret < 0) - break; - ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", - to_wake, futex_flags); + + if (err_value) { + if (ret >= 0) + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", + to_wake, futex_flags); + + if (errno != err_value) + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) expected error was %d, but returned %d (%s)\n", + to_wake, futex_flags, err_value, errno, strerror(errno)); + + break; } if (ret < 0) { ksft_exit_fail_msg("Failed futex2_wake(%d, 0x%x): %m\n", @@ -106,12 +112,12 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag join_max_threads(); for (i = 0; i < MAX_THREADS; i++) { - if (must_fail && thread_args[i].result != -1) { + if (err_value && thread_args[i].result != -1) { ksft_print_msg("Thread %d should fail but succeeded (%d)\n", i, thread_args[i].result); need_exit = 1; } - if (!must_fail 
&& thread_args[i].result != 0) { + if (!err_value && thread_args[i].result != 0) { ksft_print_msg("Thread %d failed (%d)\n", i, thread_args[i].result); need_exit = 1; } @@ -120,58 +126,30 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag ksft_exit_fail_msg("Aborting due to earlier errors.\n"); } -static void test_futex(void *futex_ptr, int must_fail) +static void test_futex(void *futex_ptr, int err_value) { - __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); } -static void test_futex_mpol(void *futex_ptr, int must_fail) +static void test_futex_mpol(void *futex_ptr, int err_value) { - __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); } -static void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - -int main(int argc, char *argv[]) +TEST(futex_numa_mpol) { struct futex32_numa *futex_numa; - int mem_size, i; void *futex_ptr; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); + int mem_size; mem_size = sysconf(_SC_PAGE_SIZE); - futex_ptr = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + futex_ptr = mmap(NULL, mem_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (futex_ptr == MAP_FAILED) ksft_exit_fail_msg("mmap() for %d bytes failed\n", mem_size); + /* 
Create an invalid memory region for the "Memory out of range" test */ + mprotect(futex_ptr + mem_size, mem_size, PROT_NONE); + futex_numa = futex_ptr; ksft_print_msg("Regular test\n"); @@ -182,27 +160,31 @@ int main(int argc, char *argv[]) if (futex_numa->numa == FUTEX_NO_NODE) ksft_exit_fail_msg("NUMA node is left uninitialized\n"); - ksft_print_msg("Memory too small\n"); - test_futex(futex_ptr + mem_size - 4, 1); + /* FUTEX2_NUMA futex must be 8-byte aligned */ + ksft_print_msg("Mis-aligned futex\n"); + test_futex(futex_ptr + mem_size - 4, EINVAL); ksft_print_msg("Memory out of range\n"); - test_futex(futex_ptr + mem_size, 1); + test_futex(futex_ptr + mem_size, EFAULT); futex_numa->numa = FUTEX_NO_NODE; mprotect(futex_ptr, mem_size, PROT_READ); ksft_print_msg("Memory, RO\n"); - test_futex(futex_ptr, 1); + test_futex(futex_ptr, EFAULT); mprotect(futex_ptr, mem_size, PROT_NONE); ksft_print_msg("Memory, no access\n"); - test_futex(futex_ptr, 1); + test_futex(futex_ptr, EFAULT); mprotect(futex_ptr, mem_size, PROT_READ | PROT_WRITE); ksft_print_msg("Memory back to RW\n"); test_futex(futex_ptr, 0); + ksft_test_result_pass("futex2 memory boundary tests passed\n"); + /* MPOL test. 
Does not work as expected */ - for (i = 0; i < 4; i++) { +#ifdef LIBNUMA_VER_SUFFICIENT + for (int i = 0; i < 4; i++) { unsigned long nodemask; int ret; @@ -221,15 +203,17 @@ int main(int argc, char *argv[]) ret = futex2_wake(futex_ptr, 0, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); if (ret < 0) ksft_test_result_fail("Failed to wake 0 with MPOL: %m\n"); - if (0) - test_futex_mpol(futex_numa, 0); if (futex_numa->numa != i) { ksft_exit_fail_msg("Returned NUMA node is %d expected %d\n", futex_numa->numa, i); } } } - ksft_test_result_pass("NUMA MPOL tests passed\n"); - ksft_finished(); - return 0; + ksft_test_result_pass("futex2 MPOL hints test passed\n"); +#else + ksft_test_result_skip("futex2 MPOL hints test requires libnuma 2.0.16+\n"); +#endif + munmap(futex_ptr, mem_size * 2); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index aea001ac4946..3b7b5851f290 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -14,7 +14,7 @@ #include <linux/prctl.h> #include <sys/prctl.h> -#include "logging.h" +#include "../../kselftest_harness.h" #define MAX_THREADS 64 @@ -128,46 +128,14 @@ static void futex_dummy_op(void) ksft_exit_fail_msg("pthread_mutex_timedlock() did not timeout: %d.\n", ret); } -static void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -g Test global hash instead intead local immutable \n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - static const char *test_msg_auto_create = "Automatic hash bucket init on thread creation.\n"; static const char *test_msg_auto_inc = "Automatic increase with more than 16 CPUs\n"; -int main(int argc, char *argv[]) +TEST(priv_hash) { int futex_slots1, futex_slotsn, 
online_cpus; pthread_mutexattr_t mutex_attr_pi; int ret, retry = 20; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(21); ret = pthread_mutexattr_init(&mutex_attr_pi); ret |= pthread_mutexattr_setprotocol(&mutex_attr_pi, PTHREAD_PRIO_INHERIT); @@ -189,14 +157,14 @@ int main(int argc, char *argv[]) if (ret != 0) ksft_exit_fail_msg("pthread_join() failed: %d, %m\n", ret); - /* First thread, has to initialiaze private hash */ + /* First thread, has to initialize private hash */ futex_slots1 = futex_hash_slots_get(); if (futex_slots1 <= 0) { ksft_print_msg("Current hash buckets: %d\n", futex_slots1); - ksft_exit_fail_msg(test_msg_auto_create); + ksft_exit_fail_msg("%s", test_msg_auto_create); } - ksft_test_result_pass(test_msg_auto_create); + ksft_test_result_pass("%s", test_msg_auto_create); online_cpus = sysconf(_SC_NPROCESSORS_ONLN); ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS + 1); @@ -237,11 +205,11 @@ retry_getslots: } ksft_print_msg("Expected increase of hash buckets but got: %d -> %d\n", futex_slots1, futex_slotsn); - ksft_exit_fail_msg(test_msg_auto_inc); + ksft_exit_fail_msg("%s", test_msg_auto_inc); } - ksft_test_result_pass(test_msg_auto_inc); + ksft_test_result_pass("%s", test_msg_auto_inc); } else { - ksft_test_result_skip(test_msg_auto_inc); + ksft_test_result_skip("%s", test_msg_auto_inc); } ret = pthread_mutex_unlock(&global_lock); @@ -257,17 +225,17 @@ retry_getslots: futex_hash_slots_set_verify(2); join_max_threads(); - ksft_test_result(counter == MAX_THREADS, "Created of waited for %d of %d threads\n", + ksft_test_result(counter == MAX_THREADS, "Created and waited for %d of %d threads\n", counter, MAX_THREADS); counter = 0; - /* Once the user set something, auto 
reisze must be disabled */ + /* Once the user set something, auto resize must be disabled */ ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); create_max_threads(thread_lock_fn); join_max_threads(); ret = futex_hash_slots_get(); - ksft_test_result(ret == 2, "No more auto-resize after manaul setting, got %d\n", + ksft_test_result(ret == 2, "No more auto-resize after manual setting, got %d\n", ret); futex_hash_slots_set_must_fail(1 << 29); @@ -280,7 +248,7 @@ retry_getslots: ret = futex_hash_slots_set(0); ksft_test_result(ret == 0, "Global hash request\n"); if (ret != 0) - goto out; + return; futex_hash_slots_set_must_fail(4); futex_hash_slots_set_must_fail(8); @@ -289,17 +257,14 @@ retry_getslots: futex_hash_slots_set_must_fail(6); ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); - if (ret != 0) { + if (ret != 0) ksft_exit_fail_msg("pthread_barrier_init failed: %m\n"); - return 1; - } + create_max_threads(thread_lock_fn); join_max_threads(); ret = futex_hash_slots_get(); ksft_test_result(ret == 0, "Continue to use global hash\n"); - -out: - ksft_finished(); - return 0; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue.c b/tools/testing/selftests/futex/functional/futex_requeue.c index 51485be6eb2f..69e2555b6039 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue.c +++ b/tools/testing/selftests/futex/functional/futex_requeue.c @@ -7,24 +7,15 @@ #include <pthread.h> #include <limits.h> -#include "logging.h" + #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 volatile futex_t *f1; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *waiterfn(void *arg) { struct timespec to; @@ -38,67 +29,49 @@ void 
*waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(requeue_single) { - pthread_t waiter[10]; - int res, ret = RET_PASS; - int c, i; volatile futex_t _f1 = 0; volatile futex_t f2 = 0; + pthread_t waiter[10]; + int res; f1 = &_f1; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); - ksft_print_msg("%s: Test futex_requeue\n", - basename(argv[0])); - /* * Requeue a waiter from f1 to f2, and wake f2. */ if (pthread_create(&waiter[0], NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Requeuing 1 futex from f1 to f2\n"); + ksft_print_dbg_msg("Requeuing 1 futex from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 0, 1, 0); - if (res != 1) { + if (res != 1) ksft_test_result_fail("futex_requeue simple returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; - } - - info("Waking 1 futex at f2\n"); + ksft_print_dbg_msg("Waking 1 futex at f2\n"); res = futex_wake(&f2, 1, 0); if (res != 1) { ksft_test_result_fail("futex_requeue simple returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_requeue simple succeeds\n"); } +} + +TEST(requeue_multiple) +{ + volatile futex_t _f1 = 0; + volatile futex_t f2 = 0; + pthread_t waiter[10]; + int res, i; + f1 = &_f1; /* * Create 10 waiters at f1. At futex_requeue, wake 3 and requeue 7. 
@@ -106,31 +79,28 @@ int main(int argc, char *argv[]) */ for (i = 0; i < 10; i++) { if (pthread_create(&waiter[i], NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); } usleep(WAKE_WAIT_US); - info("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); + ksft_print_dbg_msg("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 3, 7, 0); if (res != 10) { ksft_test_result_fail("futex_requeue many returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } - info("Waking INT_MAX futexes at f2\n"); + ksft_print_dbg_msg("Waking INT_MAX futexes at f2\n"); res = futex_wake(&f2, INT_MAX, 0); if (res != 7) { ksft_test_result_fail("futex_requeue many returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_requeue many succeeds\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c index 215c6cb539b4..f299d75848cd 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c @@ -26,11 +26,11 @@ #include <stdlib.h> #include <signal.h> #include <string.h> + #include "atomic.h" #include "futextest.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue-pi" #define MAX_WAKE_ITERS 1000 #define THREAD_MAX 10 #define SIGNAL_PERIOD_US 100 @@ -42,12 +42,6 @@ futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; futex_t wake_complete = FUTEX_INITIALIZER; -/* Test option defaults */ -static long timeout_ns; -static int broadcast; -static int owner; -static int locked; - struct thread_arg { long id; struct timespec *timeout; @@ -56,18 +50,73 @@ struct thread_arg { }; #define THREAD_ARG_INITIALIZER { 0, 
NULL, 0, 0 } -void usage(char *prog) +FIXTURE(args) { - printf("Usage: %s\n", prog); - printf(" -b Broadcast wakeup (all waiters)\n"); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -l Lock the pi futex across requeue\n"); - printf(" -o Use a third party pi futex owner during requeue (cancels -l)\n"); - printf(" -t N Timeout in nanoseconds (default: 0)\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} +}; + +FIXTURE_SETUP(args) +{ +}; + +FIXTURE_TEARDOWN(args) +{ +}; + +FIXTURE_VARIANT(args) +{ + long timeout_ns; + bool broadcast; + bool owner; + bool locked; +}; + +/* + * For a given timeout value, this macro creates a test input with all the + * possible combinations of valid arguments + */ +#define FIXTURE_VARIANT_ADD_TIMEOUT(timeout) \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout) \ +{ \ + .timeout_ns = timeout, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast_locked) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ + .locked = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast_owner) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ + .owner = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_locked) \ +{ \ + .timeout_ns = timeout, \ + .locked = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_owner) \ +{ \ + .timeout_ns = timeout, \ + .owner = true, \ +}; \ + +FIXTURE_VARIANT_ADD_TIMEOUT(0); +FIXTURE_VARIANT_ADD_TIMEOUT(5000); +FIXTURE_VARIANT_ADD_TIMEOUT(500000); +FIXTURE_VARIANT_ADD_TIMEOUT(2000000000); int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, int policy, int prio) @@ -81,26 +130,26 @@ int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); if (ret) { - 
error("pthread_attr_setinheritsched\n", ret); + ksft_exit_fail_msg("pthread_attr_setinheritsched\n"); return -1; } ret = pthread_attr_setschedpolicy(&attr, policy); if (ret) { - error("pthread_attr_setschedpolicy\n", ret); + ksft_exit_fail_msg("pthread_attr_setschedpolicy\n"); return -1; } schedp.sched_priority = prio; ret = pthread_attr_setschedparam(&attr, &schedp); if (ret) { - error("pthread_attr_setschedparam\n", ret); + ksft_exit_fail_msg("pthread_attr_setschedparam\n"); return -1; } ret = pthread_create(pth, &attr, func, arg); if (ret) { - error("pthread_create\n", ret); + ksft_exit_fail_msg("pthread_create\n"); return -1; } return 0; @@ -112,7 +161,7 @@ void *waiterfn(void *arg) struct thread_arg *args = (struct thread_arg *)arg; futex_t old_val; - info("Waiter %ld: running\n", args->id); + ksft_print_dbg_msg("Waiter %ld: running\n", args->id); /* Each thread sleeps for a different amount of time * This is to avoid races, because we don't lock the * external mutex here */ @@ -120,26 +169,25 @@ void *waiterfn(void *arg) old_val = f1; atomic_inc(&waiters_blocked); - info("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", + ksft_print_dbg_msg("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", &f1, f1, &f2); args->ret = futex_wait_requeue_pi(&f1, old_val, &f2, args->timeout, FUTEX_PRIVATE_FLAG); - info("waiter %ld woke with %d %s\n", args->id, args->ret, + ksft_print_dbg_msg("waiter %ld woke with %d %s\n", args->id, args->ret, args->ret < 0 ? 
strerror(errno) : ""); atomic_inc(&waiters_woken); if (args->ret < 0) { if (args->timeout && errno == ETIMEDOUT) args->ret = 0; else { - args->ret = RET_ERROR; - error("futex_wait_requeue_pi\n", errno); + ksft_exit_fail_msg("futex_wait_requeue_pi\n"); } futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - info("Waiter %ld: exiting with %d\n", args->id, args->ret); + ksft_print_dbg_msg("Waiter %ld: exiting with %d\n", args->id, args->ret); pthread_exit((void *)&args->ret); } @@ -152,14 +200,14 @@ void *broadcast_wakerfn(void *arg) int nr_wake = 1; int i = 0; - info("Waker: waiting for waiters to block\n"); + ksft_print_dbg_msg("Waker: waiting for waiters to block\n"); while (waiters_blocked.val < THREAD_MAX) usleep(1000); usleep(1000); - info("Waker: Calling broadcast\n"); + ksft_print_dbg_msg("Waker: Calling broadcast\n"); if (args->lock) { - info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } continue_requeue: @@ -167,16 +215,14 @@ void *broadcast_wakerfn(void *arg) args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, nr_wake, nr_requeue, FUTEX_PRIVATE_FLAG); if (args->ret < 0) { - args->ret = RET_ERROR; - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); } else if (++i < MAX_WAKE_ITERS) { task_count += args->ret; if (task_count < THREAD_MAX - waiters_woken.val) goto continue_requeue; } else { - error("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", - 0, MAX_WAKE_ITERS, task_count, THREAD_MAX); - args->ret = RET_ERROR; + ksft_exit_fail_msg("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", + MAX_WAKE_ITERS, task_count, THREAD_MAX); } futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG); @@ -187,7 +233,7 @@ void *broadcast_wakerfn(void *arg) if (args->ret > 0) args->ret = task_count; - 
info("Waker: exiting with %d\n", args->ret); + ksft_print_dbg_msg("Waker: exiting with %d\n", args->ret); pthread_exit((void *)&args->ret); } @@ -200,20 +246,20 @@ void *signal_wakerfn(void *arg) int nr_wake = 1; int i = 0; - info("Waker: waiting for waiters to block\n"); + ksft_print_dbg_msg("Waker: waiting for waiters to block\n"); while (waiters_blocked.val < THREAD_MAX) usleep(1000); usleep(1000); while (task_count < THREAD_MAX && waiters_woken.val < THREAD_MAX) { - info("task_count: %d, waiters_woken: %d\n", + ksft_print_dbg_msg("task_count: %d, waiters_woken: %d\n", task_count, waiters_woken.val); if (args->lock) { - info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", - f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", + f2, &f2); futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } - info("Waker: Calling signal\n"); + ksft_print_dbg_msg("Waker: Calling signal\n"); /* cond_signal */ old_val = f1; args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, @@ -221,28 +267,23 @@ void *signal_wakerfn(void *arg) FUTEX_PRIVATE_FLAG); if (args->ret < 0) args->ret = -errno; - info("futex: %x\n", f2); + ksft_print_dbg_msg("futex: %x\n", f2); if (args->lock) { - info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", - f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", + f2, &f2); futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); } - info("futex: %x\n", f2); - if (args->ret < 0) { - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); - args->ret = RET_ERROR; - break; - } + ksft_print_dbg_msg("futex: %x\n", f2); + if (args->ret < 0) + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); task_count += args->ret; usleep(SIGNAL_PERIOD_US); i++; /* we have to loop at least THREAD_MAX times */ if (i > MAX_WAKE_ITERS + THREAD_MAX) { - error("max signaling iterations (%d) reached, giving up on pending waiters.\n", - 0, MAX_WAKE_ITERS + THREAD_MAX); - args->ret = RET_ERROR; - break; + ksft_exit_fail_msg("max signaling iterations (%d) reached, giving up on 
pending waiters.\n", + MAX_WAKE_ITERS + THREAD_MAX); } } @@ -251,8 +292,8 @@ void *signal_wakerfn(void *arg) if (args->ret >= 0) args->ret = task_count; - info("Waker: exiting with %d\n", args->ret); - info("Waker: waiters_woken: %d\n", waiters_woken.val); + ksft_print_dbg_msg("Waker: exiting with %d\n", args->ret); + ksft_print_dbg_msg("Waker: waiters_woken: %d\n", waiters_woken.val); pthread_exit((void *)&args->ret); } @@ -269,35 +310,40 @@ void *third_party_blocker(void *arg) ret2 = futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); out: - if (args->ret || ret2) { - error("third_party_blocker() futex error", 0); - args->ret = RET_ERROR; - } + if (args->ret || ret2) + ksft_exit_fail_msg("third_party_blocker() futex error"); pthread_exit((void *)&args->ret); } -int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) +TEST_F(args, futex_requeue_pi) { - void *(*wakerfn)(void *) = signal_wakerfn; struct thread_arg blocker_arg = THREAD_ARG_INITIALIZER; struct thread_arg waker_arg = THREAD_ARG_INITIALIZER; pthread_t waiter[THREAD_MAX], waker, blocker; - struct timespec ts, *tsp = NULL; + void *(*wakerfn)(void *) = signal_wakerfn; + bool third_party_owner = variant->owner; + long timeout_ns = variant->timeout_ns; + bool broadcast = variant->broadcast; struct thread_arg args[THREAD_MAX]; - int *waiter_ret; - int i, ret = RET_PASS; + struct timespec ts, *tsp = NULL; + bool lock = variant->locked; + int *waiter_ret, i, ret = 0; + + ksft_print_msg( + "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", + broadcast, lock, third_party_owner, timeout_ns); if (timeout_ns) { time_t secs; - info("timeout_ns = %ld\n", timeout_ns); + ksft_print_dbg_msg("timeout_ns = %ld\n", timeout_ns); ret = clock_gettime(CLOCK_MONOTONIC, &ts); secs = (ts.tv_nsec + timeout_ns) / 1000000000; ts.tv_nsec = ((int64_t)ts.tv_nsec + timeout_ns) % 1000000000; ts.tv_sec += secs; - info("ts.tv_sec = %ld\n", ts.tv_sec); - info("ts.tv_nsec = %ld\n", ts.tv_nsec); + 
ksft_print_dbg_msg("ts.tv_sec = %ld\n", ts.tv_sec); + ksft_print_dbg_msg("ts.tv_nsec = %ld\n", ts.tv_nsec); tsp = &ts; } @@ -307,10 +353,7 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) if (third_party_owner) { if (create_rt_thread(&blocker, third_party_blocker, (void *)&blocker_arg, SCHED_FIFO, 1)) { - error("Creating third party blocker thread failed\n", - errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating third party blocker thread failed\n"); } } @@ -318,20 +361,16 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) for (i = 0; i < THREAD_MAX; i++) { args[i].id = i; args[i].timeout = tsp; - info("Starting thread %d\n", i); + ksft_print_dbg_msg("Starting thread %d\n", i); if (create_rt_thread(&waiter[i], waiterfn, (void *)&args[i], SCHED_FIFO, 1)) { - error("Creating waiting thread failed\n", errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating waiting thread failed\n"); } } waker_arg.lock = lock; if (create_rt_thread(&waker, wakerfn, (void *)&waker_arg, SCHED_FIFO, 1)) { - error("Creating waker thread failed\n", errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating waker thread failed\n"); } /* Wait for threads to finish */ @@ -345,7 +384,6 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) pthread_join(blocker, NULL); pthread_join(waker, NULL); -out: if (!ret) { if (*waiter_ret) ret = *waiter_ret; @@ -355,66 +393,8 @@ out: ret = blocker_arg.ret; } - return ret; + if (ret) + ksft_test_result_fail("fail\n"); } -int main(int argc, char *argv[]) -{ - char *test_name; - int c, ret; - - while ((c = getopt(argc, argv, "bchlot:v:")) != -1) { - switch (c) { - case 'b': - broadcast = 1; - break; - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'l': - locked = 1; - break; - case 'o': - owner = 1; - locked = 0; - break; - case 't': - timeout_ns = atoi(optarg); - break; - case 'v': 
- log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test requeue functionality\n", basename(argv[0])); - ksft_print_msg( - "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", - broadcast, locked, owner, timeout_ns); - - ret = asprintf(&test_name, - "%s broadcast=%d locked=%d owner=%d timeout=%ldns", - TEST_NAME, broadcast, locked, owner, timeout_ns); - if (ret < 0) { - ksft_print_msg("Failed to generate test name\n"); - test_name = TEST_NAME; - } - - /* - * FIXME: unit_test is obsolete now that we parse options and the - * various style of runs are done by run.sh - simplify the code and move - * unit_test into main() - */ - ret = unit_test(broadcast, locked, owner, timeout_ns); - - print_result(test_name, ret); - return ret; -} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c index d0a4d332ea44..77135a22a583 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c @@ -23,67 +23,32 @@ #include <stdlib.h> #include <string.h> #include <time.h> -#include "futextest.h" -#include "logging.h" -#define TEST_NAME "futex-requeue-pi-mismatched-ops" +#include "futextest.h" +#include "../../kselftest_harness.h" futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; int child_ret = 0; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *blocking_child(void *arg) { child_ret = futex_wait(&f1, f1, NULL, FUTEX_PRIVATE_FLAG); if (child_ret < 0) { child_ret = -errno; - error("futex_wait\n", errno); + 
ksft_exit_fail_msg("futex_wait\n"); } return (void *)&child_ret; } -int main(int argc, char *argv[]) +TEST(requeue_pi_mismatched_ops) { - int ret = RET_PASS; pthread_t child; - int c; + int ret; - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Detect mismatched requeue_pi operations\n", - basename(argv[0])); + if (pthread_create(&child, NULL, blocking_child, NULL)) + ksft_exit_fail_msg("pthread_create\n"); - if (pthread_create(&child, NULL, blocking_child, NULL)) { - error("pthread_create\n", errno); - ret = RET_ERROR; - goto out; - } /* Allow the child to block in the kernel. */ sleep(1); @@ -102,34 +67,27 @@ int main(int argc, char *argv[]) * FUTEX_WAKE. */ ret = futex_wake(&f1, 1, FUTEX_PRIVATE_FLAG); - if (ret == 1) { - ret = RET_PASS; - } else if (ret < 0) { - error("futex_wake\n", errno); - ret = RET_ERROR; - } else { - error("futex_wake did not wake the child\n", 0); - ret = RET_ERROR; - } + if (ret == 1) + ret = 0; + else if (ret < 0) + ksft_exit_fail_msg("futex_wake\n"); + else + ksft_exit_fail_msg("futex_wake did not wake the child\n"); } else { - error("futex_cmp_requeue_pi\n", errno); - ret = RET_ERROR; + ksft_exit_fail_msg("futex_cmp_requeue_pi\n"); } } else if (ret > 0) { - fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); - ret = RET_FAIL; + ksft_test_result_fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); } else { - error("futex_cmp_requeue_pi found no waiters\n", 0); - ret = RET_ERROR; + ksft_exit_fail_msg("futex_cmp_requeue_pi found no waiters\n"); } pthread_join(child, NULL); - if (!ret) - ret = child_ret; - - out: - /* If the kernel crashes, we shouldn't return at all. 
*/ - print_result(TEST_NAME, ret); - return ret; + if (!ret && !child_ret) + ksft_test_result_pass("futex_requeue_pi_mismatched_ops passed\n"); + else + ksft_test_result_pass("futex_requeue_pi_mismatched_ops failed\n"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index c6b8f32990c8..e34ee0f9ebcc 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -24,11 +24,11 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> + #include "atomic.h" #include "futextest.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue-pi-signal-restart" #define DELAY_US 100 futex_t f1 = FUTEX_INITIALIZER; @@ -37,15 +37,6 @@ atomic_t requeued = ATOMIC_INITIALIZER; int waiter_ret = 0; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, int policy, int prio) { @@ -57,35 +48,28 @@ int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, memset(&schedp, 0, sizeof(schedp)); ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); - if (ret) { - error("pthread_attr_setinheritsched\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_attr_setinheritsched\n"); ret = pthread_attr_setschedpolicy(&attr, policy); - if (ret) { - error("pthread_attr_setschedpolicy\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_attr_setschedpolicy\n"); schedp.sched_priority = prio; ret = pthread_attr_setschedparam(&attr, &schedp); - if (ret) { - error("pthread_attr_setschedparam\n", ret); - return -1; - } + if 
(ret) + ksft_exit_fail_msg("pthread_attr_setschedparam\n"); ret = pthread_create(pth, &attr, func, arg); - if (ret) { - error("pthread_create\n", ret); - return -1; - } + if (ret) + ksft_exit_fail_msg("pthread_create\n"); + return 0; } void handle_signal(int signo) { - info("signal received %s requeue\n", + ksft_print_dbg_msg("signal received %s requeue\n", requeued.val ? "after" : "prior to"); } @@ -94,78 +78,46 @@ void *waiterfn(void *arg) unsigned int old_val; int res; - waiter_ret = RET_PASS; - - info("Waiter running\n"); - info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Waiter running\n"); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); old_val = f1; res = futex_wait_requeue_pi(&f1, old_val, &(f2), NULL, FUTEX_PRIVATE_FLAG); if (!requeued.val || errno != EWOULDBLOCK) { - fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", + ksft_test_result_fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", res, strerror(errno)); - info("w2:futex: %x\n", f2); + ksft_print_dbg_msg("w2:futex: %x\n", f2); if (!res) futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - waiter_ret = RET_FAIL; } - info("Waiter exiting with %d\n", waiter_ret); pthread_exit(NULL); } -int main(int argc, char *argv[]) +TEST(futex_requeue_pi_signal_restart) { unsigned int old_val; struct sigaction sa; pthread_t waiter; - int c, res, ret = RET_PASS; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test signal handling during requeue_pi\n", - basename(argv[0])); - ksft_print_msg("\tArguments: <none>\n"); + int res; sa.sa_handler = handle_signal; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; - if (sigaction(SIGUSR1, &sa, NULL)) { - error("sigaction\n", errno); - 
exit(1); - } + if (sigaction(SIGUSR1, &sa, NULL)) + ksft_exit_fail_msg("sigaction\n"); - info("m1:f2: %x\n", f2); - info("Creating waiter\n"); + ksft_print_dbg_msg("m1:f2: %x\n", f2); + ksft_print_dbg_msg("Creating waiter\n"); res = create_rt_thread(&waiter, waiterfn, NULL, SCHED_FIFO, 1); - if (res) { - error("Creating waiting thread failed", res); - ret = RET_ERROR; - goto out; - } + if (res) + ksft_exit_fail_msg("Creating waiting thread failed"); - info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); - info("m2:f2: %x\n", f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("m2:f2: %x\n", f2); futex_lock_pi(&f2, 0, 0, FUTEX_PRIVATE_FLAG); - info("m3:f2: %x\n", f2); + ksft_print_dbg_msg("m3:f2: %x\n", f2); while (1) { /* @@ -173,11 +125,11 @@ int main(int argc, char *argv[]) * restart futex_wait_requeue_pi() in the kernel. Wait for the * waiter to block on f1 again. */ - info("Issuing SIGUSR1 to waiter\n"); + ksft_print_dbg_msg("Issuing SIGUSR1 to waiter\n"); pthread_kill(waiter, SIGUSR1); usleep(DELAY_US); - info("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); + ksft_print_dbg_msg("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); old_val = f1; res = futex_cmp_requeue_pi(&f1, old_val, &(f2), 1, 0, FUTEX_PRIVATE_FLAG); @@ -191,12 +143,10 @@ int main(int argc, char *argv[]) atomic_set(&requeued, 1); break; } else if (res < 0) { - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); - ret = RET_ERROR; - break; + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); } } - info("m4:f2: %x\n", f2); + ksft_print_dbg_msg("m4:f2: %x\n", f2); /* * Signal the waiter after requeue, waiter should return from @@ -204,19 +154,14 @@ int main(int argc, char *argv[]) * futex_unlock_pi() can't happen before the signal wakeup is detected * in the kernel. 
*/ - info("Issuing SIGUSR1 to waiter\n"); + ksft_print_dbg_msg("Issuing SIGUSR1 to waiter\n"); pthread_kill(waiter, SIGUSR1); - info("Waiting for waiter to return\n"); + ksft_print_dbg_msg("Waiting for waiter to return\n"); pthread_join(waiter, NULL); - info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - info("m5:f2: %x\n", f2); - - out: - if (ret == RET_PASS && waiter_ret) - ret = waiter_ret; - - print_result(TEST_NAME, ret); - return ret; + ksft_print_dbg_msg("m5:f2: %x\n", f2); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait.c b/tools/testing/selftests/futex/functional/futex_wait.c index 685140d9b93d..152ca4612886 100644 --- a/tools/testing/selftests/futex/functional/futex_wait.c +++ b/tools/testing/selftests/futex/functional/futex_wait.c @@ -9,25 +9,16 @@ #include <sys/shm.h> #include <sys/mman.h> #include <fcntl.h> -#include "logging.h" + #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 #define SHM_PATH "futex_shm_file" void *futex; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - static void *waiterfn(void *arg) { struct timespec to; @@ -45,53 +36,37 @@ static void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(private_futex) { - int res, ret = RET_PASS, fd, c, shm_id; - u_int32_t f_private = 0, *shared_data; unsigned int flags = FUTEX_PRIVATE_FLAG; + u_int32_t f_private = 0; pthread_t waiter; - void *shm; + int res; futex = &f_private; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - 
exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(3); - ksft_print_msg("%s: Test futex_wait\n", basename(argv[0])); - /* Testing a private futex */ - info("Calling private futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling private futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, (void *) &flags)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling private futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling private futex_wake on futex: %p\n", futex); res = futex_wake(futex, 1, FUTEX_PRIVATE_FLAG); if (res != 1) { ksft_test_result_fail("futex_wake private returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake private succeeds\n"); } +} + +TEST(anon_page) +{ + u_int32_t *shared_data; + pthread_t waiter; + int res, shm_id; /* Testing an anon page shared memory */ shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); @@ -105,67 +80,65 @@ int main(int argc, char *argv[]) *shared_data = 0; futex = shared_data; - info("Calling shared (page anon) futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (page anon) futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling shared (page anon) futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (page anon) futex_wake on futex: %p\n", futex); res = futex_wake(futex, 1, 0); if (res != 1) { ksft_test_result_fail("futex_wake shared (page anon) returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake shared (page anon) succeeds\n"); } + shmdt(shared_data); +} + +TEST(file_backed) +{ + u_int32_t 
f_private = 0; + pthread_t waiter; + int res, fd; + void *shm; /* Testing a file backed shared memory */ fd = open(SHM_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); - if (fd < 0) { - perror("open"); - exit(1); - } + if (fd < 0) + ksft_exit_fail_msg("open"); - if (ftruncate(fd, sizeof(f_private))) { - perror("ftruncate"); - exit(1); - } + if (ftruncate(fd, sizeof(f_private))) + ksft_exit_fail_msg("ftruncate"); shm = mmap(NULL, sizeof(f_private), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (shm == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (shm == MAP_FAILED) + ksft_exit_fail_msg("mmap"); memcpy(shm, &f_private, sizeof(f_private)); futex = shm; - info("Calling shared (file backed) futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (file backed) futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling shared (file backed) futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (file backed) futex_wake on futex: %p\n", futex); res = futex_wake(shm, 1, 0); if (res != 1) { ksft_test_result_fail("futex_wake shared (file backed) returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake shared (file backed) succeeds\n"); } - /* Freeing resources */ - shmdt(shared_data); munmap(shm, sizeof(f_private)); remove(SHM_PATH); close(fd); - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c index fb4148f23fa3..8952ebda14ab 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c +++ b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c @@ -27,10 +27,9 @@ #include <libgen.h> #include <signal.h> -#include 
"logging.h" #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-private-mapped-file" #define PAGE_SZ 4096 char pad[PAGE_SZ] = {1}; @@ -40,86 +39,44 @@ char pad2[PAGE_SZ] = {1}; #define WAKE_WAIT_US 3000000 struct timespec wait_timeout = { .tv_sec = 5, .tv_nsec = 0}; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *thr_futex_wait(void *arg) { int ret; - info("futex wait\n"); + ksft_print_dbg_msg("futex wait\n"); ret = futex_wait(&val, 1, &wait_timeout, 0); - if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) { - error("futex error.\n", errno); - print_result(TEST_NAME, RET_ERROR); - exit(RET_ERROR); - } + if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) + ksft_exit_fail_msg("futex error.\n"); if (ret && errno == ETIMEDOUT) - fail("waiter timedout\n"); + ksft_exit_fail_msg("waiter timedout\n"); - info("futex_wait: ret = %d, errno = %d\n", ret, errno); + ksft_print_dbg_msg("futex_wait: ret = %d, errno = %d\n", ret, errno); return NULL; } -int main(int argc, char **argv) +TEST(wait_private_mapped_file) { pthread_t thr; - int ret = RET_PASS; int res; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg( - "%s: Test the futex value of private file mappings in FUTEX_WAIT\n", - basename(argv[0])); - - ret = pthread_create(&thr, NULL, thr_futex_wait, NULL); - if (ret < 0) { - fprintf(stderr, "pthread_create error\n"); - ret = RET_ERROR; - goto out; - } - - info("wait a while\n"); + + res = pthread_create(&thr, NULL, thr_futex_wait, NULL); + if (res < 0) + 
ksft_exit_fail_msg("pthread_create error\n"); + + ksft_print_dbg_msg("wait a while\n"); usleep(WAKE_WAIT_US); val = 2; res = futex_wake(&val, 1, 0); - info("futex_wake %d\n", res); - if (res != 1) { - fail("FUTEX_WAKE didn't find the waiting thread.\n"); - ret = RET_FAIL; - } + ksft_print_dbg_msg("futex_wake %d\n", res); + if (res != 1) + ksft_exit_fail_msg("FUTEX_WAKE didn't find the waiting thread.\n"); - info("join\n"); + ksft_print_dbg_msg("join\n"); pthread_join(thr, NULL); - out: - print_result(TEST_NAME, ret); - return ret; + ksft_test_result_pass("wait_private_mapped_file"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c index d183f878360b..0c8766aced2e 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -16,26 +16,15 @@ *****************************************************************************/ #include <pthread.h> + #include "futextest.h" #include "futex2test.h" -#include "logging.h" - -#define TEST_NAME "futex-wait-timeout" +#include "../../kselftest_harness.h" static long timeout_ns = 100000; /* 100us default timeout */ static futex_t futex_pi; static pthread_barrier_t barrier; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -t N Timeout in nanoseconds (default: 100,000)\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - /* * Get a PI lock and hold it forever, so the main thread lock_pi will block * and we can test the timeout @@ -47,13 +36,13 @@ void *get_pi_lock(void *arg) ret = futex_lock_pi(&futex_pi, NULL, 0, 0); if (ret != 0) - error("futex_lock_pi failed\n", ret); + ksft_exit_fail_msg("futex_lock_pi failed\n"); pthread_barrier_wait(&barrier); /* Blocks forever */ ret = futex_wait(&lock, 0, 
NULL, 0); - error("futex_wait failed\n", ret); + ksft_exit_fail_msg("futex_wait failed\n"); return NULL; } @@ -61,12 +50,11 @@ void *get_pi_lock(void *arg) /* * Check if the function returned the expected error */ -static void test_timeout(int res, int *ret, char *test_name, int err) +static void test_timeout(int res, char *test_name, int err) { if (!res || errno != err) { ksft_test_result_fail("%s returned %d\n", test_name, res < 0 ? errno : res); - *ret = RET_FAIL; } else { ksft_test_result_pass("%s succeeds\n", test_name); } @@ -78,10 +66,8 @@ static void test_timeout(int res, int *ret, char *test_name, int err) static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, long timeout_ns) { - if (clock_gettime(clockid, to)) { - error("clock_gettime failed\n", errno); - return errno; - } + if (clock_gettime(clockid, to)) + ksft_exit_fail_msg("clock_gettime failed\n"); to->tv_nsec += timeout_ns; @@ -93,83 +79,66 @@ static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, return 0; } -int main(int argc, char *argv[]) +TEST(wait_bitset) { futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; struct timespec to; - pthread_t thread; - int c; - struct futex_waitv waitv = { - .uaddr = (uintptr_t)&f1, - .val = f1, - .flags = FUTEX_32, - .__reserved = 0 - }; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 't': - timeout_ns = atoi(optarg); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(9); - ksft_print_msg("%s: Block on a futex and wait for timeout\n", - basename(argv[0])); - ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); - - pthread_barrier_init(&barrier, NULL, 2); - pthread_create(&thread, NULL, get_pi_lock, NULL); + int res; /* initialize relative timeout */ to.tv_sec = 0; to.tv_nsec = timeout_ns; 
res = futex_wait(&f1, f1, &to, 0); - test_timeout(res, &ret, "futex_wait relative", ETIMEDOUT); + test_timeout(res, "futex_wait relative", ETIMEDOUT); /* FUTEX_WAIT_BITSET with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_bitset(&f1, f1, &to, 1, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_wait_bitset realtime", ETIMEDOUT); + test_timeout(res, "futex_wait_bitset realtime", ETIMEDOUT); /* FUTEX_WAIT_BITSET with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_bitset(&f1, f1, &to, 1, 0); - test_timeout(res, &ret, "futex_wait_bitset monotonic", ETIMEDOUT); + test_timeout(res, "futex_wait_bitset monotonic", ETIMEDOUT); +} + +TEST(requeue_pi) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct timespec to; + int res; /* FUTEX_WAIT_REQUEUE_PI with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_wait_requeue_pi realtime", ETIMEDOUT); + test_timeout(res, "futex_wait_requeue_pi realtime", ETIMEDOUT); /* FUTEX_WAIT_REQUEUE_PI with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, 0); - test_timeout(res, &ret, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + test_timeout(res, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + +} + +TEST(lock_pi) +{ + struct timespec to; + pthread_t thread; + int res; + + /* Create a thread that will lock forever so any waiter will timeout */ + pthread_barrier_init(&barrier, NULL, 2); + pthread_create(&thread, NULL, get_pi_lock, NULL); /* Wait until the other thread calls 
futex_lock_pi() */ pthread_barrier_wait(&barrier); pthread_barrier_destroy(&barrier); + /* * FUTEX_LOCK_PI with CLOCK_REALTIME * Due to historical reasons, FUTEX_LOCK_PI supports only realtime @@ -181,26 +150,38 @@ int main(int argc, char *argv[]) * smaller than realtime and the syscall will timeout immediately. */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_lock_pi(&futex_pi, &to, 0, 0); - test_timeout(res, &ret, "futex_lock_pi realtime", ETIMEDOUT); + test_timeout(res, "futex_lock_pi realtime", ETIMEDOUT); /* Test operations that don't support FUTEX_CLOCK_REALTIME */ res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); + test_timeout(res, "futex_lock_pi invalid timeout flag", ENOSYS); +} + +TEST(waitv) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1, + .flags = FUTEX_32, + .__reserved = 0, + }; + struct timespec to; + int res; /* futex_waitv with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); - test_timeout(res, &ret, "futex_waitv monotonic", ETIMEDOUT); + test_timeout(res, "futex_waitv monotonic", ETIMEDOUT); /* futex_waitv with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME); - test_timeout(res, &ret, "futex_waitv realtime", ETIMEDOUT); - - ksft_print_cnts(); - return ret; + test_timeout(res, "futex_waitv realtime", ETIMEDOUT); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c index ed9cd07e31c1..ce2301500d83 
100644 --- a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c +++ b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c @@ -29,95 +29,55 @@ #include <linux/futex.h> #include <libgen.h> -#include "logging.h" #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-uninitialized-heap" #define WAIT_US 5000000 static int child_blocked = 1; -static int child_ret; +static bool child_ret; void *buf; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *wait_thread(void *arg) { int res; - child_ret = RET_PASS; + child_ret = true; res = futex_wait(buf, 1, NULL, 0); child_blocked = 0; if (res != 0 && errno != EWOULDBLOCK) { - error("futex failure\n", errno); - child_ret = RET_ERROR; + ksft_exit_fail_msg("futex failure\n"); + child_ret = false; } pthread_exit(NULL); } -int main(int argc, char **argv) +TEST(futex_wait_uninitialized_heap) { - int c, ret = RET_PASS; long page_size; pthread_t thr; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } + int ret; page_size = sysconf(_SC_PAGESIZE); buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - if (buf == (void *)-1) { - error("mmap\n", errno); - exit(1); - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test the uninitialized futex value in FUTEX_WAIT\n", - basename(argv[0])); - + if (buf == (void *)-1) + ksft_exit_fail_msg("mmap\n"); ret = pthread_create(&thr, NULL, wait_thread, NULL); - if (ret) { - error("pthread_create\n", errno); - ret = RET_ERROR; - goto out; - } + if (ret) + 
ksft_exit_fail_msg("pthread_create\n"); - info("waiting %dus for child to return\n", WAIT_US); + ksft_print_dbg_msg("waiting %dus for child to return\n", WAIT_US); usleep(WAIT_US); - ret = child_ret; - if (child_blocked) { - fail("child blocked in kernel\n"); - ret = RET_FAIL; - } + if (child_blocked) + ksft_test_result_fail("child blocked in kernel\n"); - out: - print_result(TEST_NAME, ret); - return ret; + if (!child_ret) + ksft_test_result_fail("child error\n"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 2d8230da9064..36b7a54a4085 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -21,72 +21,44 @@ #include <stdlib.h> #include <string.h> #include <time.h> + #include "futextest.h" #include "futex2test.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-wouldblock" #define timeout_ns 100000 -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - -int main(int argc, char *argv[]) +TEST(futex_wait_wouldblock) { struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; - int c; - struct futex_waitv waitv = { - .uaddr = (uintptr_t)&f1, - .val = f1+1, - .flags = FUTEX_32, - .__reserved = 0 - }; + int res; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); - ksft_print_msg("%s: Test the unexpected futex value in 
FUTEX_WAIT\n", - basename(argv[0])); - - info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + ksft_print_dbg_msg("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); if (!res || errno != EWOULDBLOCK) { ksft_test_result_fail("futex_wait returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wait\n"); } +} - if (clock_gettime(CLOCK_MONOTONIC, &to)) { - error("clock_gettime failed\n", errno); - return errno; - } +TEST(futex_waitv_wouldblock) +{ + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + futex_t f1 = FUTEX_INITIALIZER; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1 + 1, + .flags = FUTEX_32, + .__reserved = 0, + }; + int res; + + if (clock_gettime(CLOCK_MONOTONIC, &to)) + ksft_exit_fail_msg("clock_gettime failed %d\n", errno); to.tv_nsec += timeout_ns; @@ -95,17 +67,15 @@ int main(int argc, char *argv[]) to.tv_nsec -= 1000000000; } - info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + ksft_print_dbg_msg("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); if (!res || errno != EWOULDBLOCK) { ksft_test_result_fail("futex_waitv returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c index a94337f677e1..c684b10eb76e 100644 --- a/tools/testing/selftests/futex/functional/futex_waitv.c +++ b/tools/testing/selftests/futex/functional/futex_waitv.c @@ -15,25 +15,16 @@ #include <pthread.h> #include <stdint.h> #include <sys/shm.h> + #include "futextest.h" #include "futex2test.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait" #define WAKE_WAIT_US 10000 #define NR_FUTEXES 30 static struct futex_waitv waitv[NR_FUTEXES]; u_int32_t futexes[NR_FUTEXES] = {0}; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *waiterfn(void *arg) { struct timespec to; @@ -41,7 +32,7 @@ void *waiterfn(void *arg) /* setting absolute timeout for futex2 */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -57,34 +48,10 @@ void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(private_waitv) { pthread_t waiter; - int res, ret = RET_PASS; - struct timespec to; - int c, i; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(7); - ksft_print_msg("%s: Test FUTEX_WAITV\n", - basename(argv[0])); + int res, i; for (i = 0; i < NR_FUTEXES; i++) { waitv[i].uaddr = (uintptr_t)&futexes[i]; @@ -95,7 +62,7 @@ int main(int argc, 
char *argv[]) /* Private waitv */ if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); @@ -104,10 +71,15 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv private\n"); } +} + +TEST(shared_waitv) +{ + pthread_t waiter; + int res, i; /* Shared waitv */ for (i = 0; i < NR_FUTEXES; i++) { @@ -128,7 +100,7 @@ int main(int argc, char *argv[]) } if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); @@ -137,19 +109,24 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake shared returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv shared\n"); } for (i = 0; i < NR_FUTEXES; i++) shmdt(u64_to_ptr(waitv[i].uaddr)); +} + +TEST(invalid_flag) +{ + struct timespec to; + int res; /* Testing a waiter without FUTEX_32 flag */ waitv[0].flags = FUTEX_PRIVATE_FLAG; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -158,17 +135,22 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv without FUTEX_32\n"); } +} + +TEST(unaligned_address) +{ + struct timespec to; + int res; /* Testing a waiter with an unaligned address */ waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32; waitv[0].uaddr = 1; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -177,16 +159,21 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv with an unaligned address\n"); } +} + +TEST(null_address) +{ + struct timespec to; + int res; /* Testing a NULL address for waiters.uaddr */ waitv[0].uaddr = 0x00000000; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -195,14 +182,13 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv NULL address in waitv.uaddr\n"); } /* Testing a NULL address for *waiters */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -211,14 +197,19 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv NULL address in *waiters\n"); } +} + +TEST(invalid_clockid) +{ + struct timespec to; + int res; /* Testing an invalid clockid */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -227,11 +218,9 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv invalid clockid\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 81739849f299..e88545c06d57 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -18,74 +18,36 @@ # ############################################################################### -# Test for a color capable console -if [ -z "$USE_COLOR" ]; then - tput setf 7 || tput setaf 7 - if [ $? 
-eq 0 ]; then - USE_COLOR=1 - tput sgr0 - fi -fi -if [ "$USE_COLOR" -eq 1 ]; then - COLOR="-c" -fi - - echo -# requeue pi testing -# without timeouts -./futex_requeue_pi $COLOR -./futex_requeue_pi $COLOR -b -./futex_requeue_pi $COLOR -b -l -./futex_requeue_pi $COLOR -b -o -./futex_requeue_pi $COLOR -l -./futex_requeue_pi $COLOR -o -# with timeouts -./futex_requeue_pi $COLOR -b -l -t 5000 -./futex_requeue_pi $COLOR -l -t 5000 -./futex_requeue_pi $COLOR -b -l -t 500000 -./futex_requeue_pi $COLOR -l -t 500000 -./futex_requeue_pi $COLOR -b -t 5000 -./futex_requeue_pi $COLOR -t 5000 -./futex_requeue_pi $COLOR -b -t 500000 -./futex_requeue_pi $COLOR -t 500000 -./futex_requeue_pi $COLOR -b -o -t 5000 -./futex_requeue_pi $COLOR -l -t 5000 -./futex_requeue_pi $COLOR -b -o -t 500000 -./futex_requeue_pi $COLOR -l -t 500000 -# with long timeout -./futex_requeue_pi $COLOR -b -l -t 2000000000 -./futex_requeue_pi $COLOR -l -t 2000000000 - +./futex_requeue_pi echo -./futex_requeue_pi_mismatched_ops $COLOR +./futex_requeue_pi_mismatched_ops echo -./futex_requeue_pi_signal_restart $COLOR +./futex_requeue_pi_signal_restart echo -./futex_wait_timeout $COLOR +./futex_wait_timeout echo -./futex_wait_wouldblock $COLOR +./futex_wait_wouldblock echo -./futex_wait_uninitialized_heap $COLOR -./futex_wait_private_mapped_file $COLOR +./futex_wait_uninitialized_heap +./futex_wait_private_mapped_file echo -./futex_wait $COLOR +./futex_wait echo -./futex_requeue $COLOR +./futex_requeue echo -./futex_waitv $COLOR +./futex_waitv echo -./futex_priv_hash $COLOR -./futex_priv_hash -g $COLOR +./futex_priv_hash echo -./futex_numa_mpol $COLOR +./futex_numa_mpol diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h index 7a5fd1d5355e..3d48e9789d9f 100644 --- a/tools/testing/selftests/futex/include/futextest.h +++ b/tools/testing/selftests/futex/include/futextest.h @@ -58,6 +58,17 @@ typedef volatile u_int32_t futex_t; #define SYS_futex 
SYS_futex_time64 #endif +/* + * On 32bit systems if we use "-D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64" or if + * we are using a newer compiler then the size of the timestamps will be 64bit, + * however, the SYS_futex will still point to the 32bit futex system call. + */ +#if __SIZEOF_POINTER__ == 4 && defined(SYS_futex_time64) && \ + defined(_TIME_BITS) && _TIME_BITS == 64 +# undef SYS_futex +# define SYS_futex SYS_futex_time64 +#endif + /** * futex() - SYS_futex syscall wrapper * @uaddr: address of first futex diff --git a/tools/testing/selftests/futex/include/logging.h b/tools/testing/selftests/futex/include/logging.h deleted file mode 100644 index 874c69ce5cce..000000000000 --- a/tools/testing/selftests/futex/include/logging.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/****************************************************************************** - * - * Copyright © International Business Machines Corp., 2009 - * - * DESCRIPTION - * Glibc independent futex library for testing kernel functionality. - * - * AUTHOR - * Darren Hart <dvhart@linux.intel.com> - * - * HISTORY - * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com> - * - *****************************************************************************/ - -#ifndef _LOGGING_H -#define _LOGGING_H - -#include <stdio.h> -#include <string.h> -#include <unistd.h> -#include <linux/futex.h> -#include "kselftest.h" - -/* - * Define PASS, ERROR, and FAIL strings with and without color escape - * sequences, default to no color. 
- */ -#define ESC 0x1B, '[' -#define BRIGHT '1' -#define GREEN '3', '2' -#define YELLOW '3', '3' -#define RED '3', '1' -#define ESCEND 'm' -#define BRIGHT_GREEN ESC, BRIGHT, ';', GREEN, ESCEND -#define BRIGHT_YELLOW ESC, BRIGHT, ';', YELLOW, ESCEND -#define BRIGHT_RED ESC, BRIGHT, ';', RED, ESCEND -#define RESET_COLOR ESC, '0', 'm' -static const char PASS_COLOR[] = {BRIGHT_GREEN, ' ', 'P', 'A', 'S', 'S', - RESET_COLOR, 0}; -static const char ERROR_COLOR[] = {BRIGHT_YELLOW, 'E', 'R', 'R', 'O', 'R', - RESET_COLOR, 0}; -static const char FAIL_COLOR[] = {BRIGHT_RED, ' ', 'F', 'A', 'I', 'L', - RESET_COLOR, 0}; -static const char INFO_NORMAL[] = " INFO"; -static const char PASS_NORMAL[] = " PASS"; -static const char ERROR_NORMAL[] = "ERROR"; -static const char FAIL_NORMAL[] = " FAIL"; -const char *INFO = INFO_NORMAL; -const char *PASS = PASS_NORMAL; -const char *ERROR = ERROR_NORMAL; -const char *FAIL = FAIL_NORMAL; - -/* Verbosity setting for INFO messages */ -#define VQUIET 0 -#define VCRITICAL 1 -#define VINFO 2 -#define VMAX VINFO -int _verbose = VCRITICAL; - -/* Functional test return codes */ -#define RET_PASS 0 -#define RET_ERROR -1 -#define RET_FAIL -2 - -/** - * log_color() - Use colored output for PASS, ERROR, and FAIL strings - * @use_color: use color (1) or not (0) - */ -void log_color(int use_color) -{ - if (use_color) { - PASS = PASS_COLOR; - ERROR = ERROR_COLOR; - FAIL = FAIL_COLOR; - } else { - PASS = PASS_NORMAL; - ERROR = ERROR_NORMAL; - FAIL = FAIL_NORMAL; - } -} - -/** - * log_verbosity() - Set verbosity of test output - * @verbose: Enable (1) verbose output or not (0) - * - * Currently setting verbose=1 will enable INFO messages and 0 will disable - * them. FAIL and ERROR messages are always displayed. 
- */ -void log_verbosity(int level) -{ - if (level > VMAX) - level = VMAX; - else if (level < 0) - level = 0; - _verbose = level; -} - -/** - * print_result() - Print standard PASS | ERROR | FAIL results - * @ret: the return value to be considered: 0 | RET_ERROR | RET_FAIL - * - * print_result() is primarily intended for functional tests. - */ -void print_result(const char *test_name, int ret) -{ - switch (ret) { - case RET_PASS: - ksft_test_result_pass("%s\n", test_name); - ksft_print_cnts(); - return; - case RET_ERROR: - ksft_test_result_error("%s\n", test_name); - ksft_print_cnts(); - return; - case RET_FAIL: - ksft_test_result_fail("%s\n", test_name); - ksft_print_cnts(); - return; - } -} - -/* log level macros */ -#define info(message, vargs...) \ -do { \ - if (_verbose >= VINFO) \ - fprintf(stderr, "\t%s: "message, INFO, ##vargs); \ -} while (0) - -#define error(message, err, args...) \ -do { \ - if (_verbose >= VCRITICAL) {\ - if (err) \ - fprintf(stderr, "\t%s: %s: "message, \ - ERROR, strerror(err), ##args); \ - else \ - fprintf(stderr, "\t%s: "message, ERROR, ##args); \ - } \ -} while (0) - -#define fail(message, args...) \ -do { \ - if (_verbose >= VCRITICAL) \ - fprintf(stderr, "\t%s: "message, FAIL, ##args); \ -} while (0) - -#endif diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 651fc9f13c08..45c14323a618 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -113,7 +113,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata, * necessarily mean a test failure, just that the limit has to be made * bigger. 
*/ - ASSERT_GT(400, nth_state->iteration); + ASSERT_GT(1000, nth_state->iteration); if (nth_state->iteration != 0) { ssize_t res; ssize_t res2; diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index c3b6d2604b1e..8deeb4b72e73 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -54,6 +54,7 @@ #include <stdlib.h> #include <unistd.h> #include <stdarg.h> +#include <stdbool.h> #include <string.h> #include <stdio.h> #include <sys/utsname.h> @@ -104,6 +105,7 @@ struct ksft_count { static struct ksft_count ksft_cnt; static unsigned int ksft_plan; +static bool ksft_debug_enabled; static inline unsigned int ksft_test_num(void) { @@ -175,6 +177,18 @@ static inline __printf(1, 2) void ksft_print_msg(const char *msg, ...) va_end(args); } +static inline void ksft_print_dbg_msg(const char *msg, ...) +{ + va_list args; + + if (!ksft_debug_enabled) + return; + + va_start(args, msg); + ksft_print_msg(msg, args); + va_end(args); +} + static inline void ksft_perror(const char *msg) { ksft_print_msg("%s: %s (%d)\n", msg, strerror(errno), errno); diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2925e47db995..3f66e862e83e 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -751,7 +751,7 @@ for (; _metadata->trigger; _metadata->trigger = \ __bail(_assert, _metadata)) -#define is_signed_type(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1)) +#define is_signed_var(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1)) #define __EXPECT(_expected, _expected_str, _seen, _seen_str, _t, _assert) do { \ /* Avoid multiple evaluation of the cases */ \ @@ -759,7 +759,7 @@ __typeof__(_seen) __seen = (_seen); \ if (!(__exp _t __seen)) { \ /* Report with actual signedness to avoid weird output. 
*/ \ - switch (is_signed_type(__exp) * 2 + is_signed_type(__seen)) { \ + switch (is_signed_var(__exp) * 2 + is_signed_var(__seen)) { \ case 0: { \ uintmax_t __exp_print = (uintmax_t)__exp; \ uintmax_t __seen_print = (uintmax_t)__seen; \ @@ -1091,7 +1091,7 @@ static int test_harness_argv_check(int argc, char **argv) { int opt; - while ((opt = getopt(argc, argv, "hlF:f:V:v:t:T:r:")) != -1) { + while ((opt = getopt(argc, argv, "dhlF:f:V:v:t:T:r:")) != -1) { switch (opt) { case 'f': case 'F': @@ -1104,12 +1104,16 @@ static int test_harness_argv_check(int argc, char **argv) case 'l': test_harness_list_tests(); return KSFT_SKIP; + case 'd': + ksft_debug_enabled = true; + break; case 'h': default: fprintf(stderr, - "Usage: %s [-h|-l] [-t|-T|-v|-V|-f|-F|-r name]\n" + "Usage: %s [-h|-l|-d] [-t|-T|-v|-V|-f|-F|-r name]\n" "\t-h print help\n" "\t-l list all tests\n" + "\t-d enable debug prints\n" "\n" "\t-t name include test\n" "\t-T name exclude test\n" @@ -1142,8 +1146,9 @@ static bool test_enabled(int argc, char **argv, int opt; optind = 1; - while ((opt = getopt(argc, argv, "F:f:V:v:t:T:r:")) != -1) { - has_positive |= islower(opt); + while ((opt = getopt(argc, argv, "dF:f:V:v:t:T:r:")) != -1) { + if (opt != 'd') + has_positive |= islower(opt); switch (tolower(opt)) { case 't': diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index f6fe7a07a0a2..41b40c676d7f 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -169,6 +169,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_irq TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3 +TEST_GEN_PROGS_arm64 += arm64/kvm-uuid TEST_GEN_PROGS_arm64 += access_tracking_perf_test TEST_GEN_PROGS_arm64 += arch_timer TEST_GEN_PROGS_arm64 += coalesced_io_test diff --git a/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c 
b/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c index cef8f7323ceb..713005b6f508 100644 --- a/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c @@ -146,7 +146,7 @@ static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); + el0 = FIELD_GET(ID_AA64PFR0_EL1_EL0, val); return el0 == ID_AA64PFR0_EL1_EL0_IMP; } diff --git a/tools/testing/selftests/kvm/arm64/debug-exceptions.c b/tools/testing/selftests/kvm/arm64/debug-exceptions.c index e34963956fbc..1d431de8729c 100644 --- a/tools/testing/selftests/kvm/arm64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/arm64/debug-exceptions.c @@ -116,12 +116,12 @@ static void reset_debug_state(void) /* Reset all bcr/bvr/wcr/wvr registers */ dfr0 = read_sysreg(id_aa64dfr0_el1); - brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), dfr0); + brps = FIELD_GET(ID_AA64DFR0_EL1_BRPs, dfr0); for (i = 0; i <= brps; i++) { write_dbgbcr(i, 0); write_dbgbvr(i, 0); } - wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), dfr0); + wrps = FIELD_GET(ID_AA64DFR0_EL1_WRPs, dfr0); for (i = 0; i <= wrps; i++) { write_dbgwcr(i, 0); write_dbgwvr(i, 0); @@ -418,7 +418,7 @@ static void guest_code_ss(int test_cnt) static int debug_version(uint64_t id_aa64dfr0) { - return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0); + return FIELD_GET(ID_AA64DFR0_EL1_DebugVer, id_aa64dfr0); } static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) @@ -539,14 +539,14 @@ void test_guest_debug_exceptions_all(uint64_t aa64dfr0) int b, w, c; /* Number of breakpoints */ - brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), aa64dfr0) + 1; + brp_num = FIELD_GET(ID_AA64DFR0_EL1_BRPs, aa64dfr0) + 1; __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required"); /* Number of watchpoints */ - wrp_num = 
FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), aa64dfr0) + 1; + wrp_num = FIELD_GET(ID_AA64DFR0_EL1_WRPs, aa64dfr0) + 1; /* Number of context aware breakpoints */ - ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_CTX_CMPs), aa64dfr0) + 1; + ctx_brp_num = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs, aa64dfr0) + 1; pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__, brp_num, wrp_num, ctx_brp_num); diff --git a/tools/testing/selftests/kvm/arm64/kvm-uuid.c b/tools/testing/selftests/kvm/arm64/kvm-uuid.c new file mode 100644 index 000000000000..af9581b860f1 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/kvm-uuid.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Check that nobody has tampered with KVM's UID + +#include <errno.h> +#include <linux/arm-smccc.h> +#include <asm/kvm.h> +#include <kvm_util.h> + +#include "processor.h" + +/* + * Do NOT redefine these constants, or try to replace them with some + * "common" version. They are hardcoded here to detect any potential + * breakage happening in the rest of the kernel. 
+ * + * KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 + */ +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU + +static void guest_code(void) +{ + struct arm_smccc_res res = {}; + + smccc_hvc(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, 0, 0, 0, 0, 0, 0, 0, &res); + + __GUEST_ASSERT(res.a0 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 && + res.a1 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 && + res.a2 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 && + res.a3 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3, + "Unexpected KVM-specific UID %lx %lx %lx %lx\n", res.a0, res.a1, res.a2, res.a3); + GUEST_DONE(); +} + +int main (int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + bool guest_done = false; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + while (!guest_done) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + break; + case UCALL_DONE: + guest_done = true; + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + default: + TEST_FAIL("Unexpected guest exit"); + } + } + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c index ebd70430c89d..f222538e6084 100644 --- a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c +++ b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c @@ -54,7 +54,7 @@ static void guest_code(void) * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having * hidden the feature at runtime without any other userspace action. 
*/ - __GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), + __GUEST_ASSERT(FIELD_GET(ID_AA64PFR0_EL1_GIC, read_sysreg(id_aa64pfr0_el1)) == 0, "GICv3 wrongly advertised"); @@ -165,7 +165,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, NULL); pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - __TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0), + __TEST_REQUIRE(FIELD_GET(ID_AA64PFR0_EL1_GIC, pfr0), "GICv3 not supported."); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/arm64/page_fault_test.c b/tools/testing/selftests/kvm/arm64/page_fault_test.c index dc6559dad9d8..4ccbd389d133 100644 --- a/tools/testing/selftests/kvm/arm64/page_fault_test.c +++ b/tools/testing/selftests/kvm/arm64/page_fault_test.c @@ -95,14 +95,14 @@ static bool guest_check_lse(void) uint64_t isar0 = read_sysreg(id_aa64isar0_el1); uint64_t atomic; - atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC), isar0); + atomic = FIELD_GET(ID_AA64ISAR0_EL1_ATOMIC, isar0); return atomic >= 2; } static bool guest_check_dc_zva(void) { uint64_t dczid = read_sysreg(dczid_el0); - uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_EL0_DZP), dczid); + uint64_t dzp = FIELD_GET(DCZID_EL0_DZP, dczid); return dzp == 0; } @@ -195,7 +195,7 @@ static bool guest_set_ha(void) uint64_t hadbs, tcr; /* Skip if HA is not supported. 
*/ - hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS), mmfr1); + hadbs = FIELD_GET(ID_AA64MMFR1_EL1_HAFDBS, mmfr1); if (hadbs == 0) return false; diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index d3bf9204409c..189321e96925 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -243,6 +243,7 @@ static void guest_code(void) GUEST_REG_SYNC(SYS_ID_AA64MMFR0_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); + GUEST_REG_SYNC(SYS_ID_AA64MMFR3_EL1); GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); GUEST_REG_SYNC(SYS_CTR_EL0); GUEST_REG_SYNC(SYS_MIDR_EL1); @@ -594,8 +595,8 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu) */ val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1)); - mte = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), val); - mte_frac = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac), val); + mte = FIELD_GET(ID_AA64PFR1_EL1_MTE, val); + mte_frac = FIELD_GET(ID_AA64PFR1_EL1_MTE_frac, val); if (mte != ID_AA64PFR1_EL1_MTE_MTE2 || mte_frac != ID_AA64PFR1_EL1_MTE_frac_NI) { ksft_test_result_skip("MTE_ASYNC or MTE_ASYMM are supported, nothing to test\n"); @@ -612,7 +613,7 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu) } val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1)); - mte_frac = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac), val); + mte_frac = FIELD_GET(ID_AA64PFR1_EL1_MTE_frac, val); if (mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI) ksft_test_result_pass("ID_AA64PFR1_EL1.MTE_frac=0 accepted and still 0xF\n"); else @@ -774,7 +775,7 @@ int main(void) /* Check for AARCH64 only system */ val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); + el0 = FIELD_GET(ID_AA64PFR0_EL1_EL0, val); aarch64_only = (el0 == ID_AA64PFR0_EL1_EL0_IMP); ksft_print_header(); diff --git 
a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c index f16b3b27e32e..a0c4ab839155 100644 --- a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c @@ -441,7 +441,7 @@ static void create_vpmu_vm(void *guest_code) /* Make sure that PMUv3 support is indicated in the ID register */ dfr0 = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1)); - pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), dfr0); + pmuver = FIELD_GET(ID_AA64DFR0_EL1_PMUVer, dfr0); TEST_ASSERT(pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP, "Unexpected PMUVER (0x%x) on the vCPU with PMUv3", pmuver); diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 9d69904cb608..eb115123d741 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -573,15 +573,15 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, err = ioctl(vcpu_fd, KVM_GET_ONE_REG, ®); TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val); + gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN4, val); *ipa4k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN4_NI, ID_AA64MMFR0_EL1_TGRAN4_52_BIT); - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val); + gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN64, val); *ipa64k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN64_NI, ID_AA64MMFR0_EL1_TGRAN64_IMP); - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val); + gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN16, val); *ipa16k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN16_NI, ID_AA64MMFR0_EL1_TGRAN16_52_BIT); diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 
d30625c18259..c744c603d688 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -1554,8 +1554,8 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc) } /* Read from the page to populate the shared zeropage. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index b0d42eb04e3a..8dd81c0a4a5a 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -145,7 +145,7 @@ static bool try_access_buf(char *ptr, bool write) if (write) *ptr = 'x'; else - FORCE_READ(ptr); + FORCE_READ(*ptr); } signal_jump_set = false; diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 1afe14b9dc0c..c5940c0595be 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -50,8 +50,10 @@ void read_fault_pages(void *addr, unsigned long nr_pages) unsigned long i; for (i = 0; i < nr_pages; i++) { + unsigned long *addr2 = + ((unsigned long *)(addr + (i * huge_page_size))); /* Prevent the compiler from optimizing out the entire loop: */ - FORCE_READ(((unsigned long *)(addr + (i * huge_page_size)))); + FORCE_READ(*addr2); } } diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index c5a73617796a..ea945eebec2f 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -110,7 +110,7 @@ void *access_mem(void *ptr) * the memory access actually happens and prevents the compiler * from optimizing away this entire loop. 
*/ - FORCE_READ((uint64_t *)ptr); + FORCE_READ(*(uint64_t *)ptr); } return NULL; diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index fccf9e797a0c..5bd52a951cbd 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -5,10 +5,14 @@ #define _GNU_SOURCE #include <errno.h> +#include <fcntl.h> +#include <linux/userfaultfd.h> #include <stdlib.h> #include <stdio.h> #include <string.h> +#include <sys/ioctl.h> #include <sys/mman.h> +#include <syscall.h> #include <time.h> #include <stdbool.h> @@ -168,6 +172,7 @@ static bool is_range_mapped(FILE *maps_fp, unsigned long start, if (first_val <= start && second_val >= end) { success = true; + fflush(maps_fp); break; } } @@ -175,6 +180,15 @@ static bool is_range_mapped(FILE *maps_fp, unsigned long start, return success; } +/* Check if [ptr, ptr + size) mapped in /proc/self/maps. */ +static bool is_ptr_mapped(FILE *maps_fp, void *ptr, unsigned long size) +{ + unsigned long start = (unsigned long)ptr; + unsigned long end = start + size; + + return is_range_mapped(maps_fp, start, end); +} + /* * Returns the start address of the mapping on success, else returns * NULL on failure. @@ -733,6 +747,249 @@ out: dont_unmap ? 
" [dontunnmap]" : ""); } +#ifdef __NR_userfaultfd +static void mremap_move_multi_invalid_vmas(FILE *maps_fp, + unsigned long page_size) +{ + char *test_name = "mremap move multiple invalid vmas"; + const size_t size = 10 * page_size; + bool success = true; + char *ptr, *tgt_ptr; + int uffd, err, i; + void *res; + struct uffdio_api api = { + .api = UFFD_API, + .features = UFFD_EVENT_PAGEFAULT, + }; + + uffd = syscall(__NR_userfaultfd, O_NONBLOCK); + if (uffd == -1) { + err = errno; + perror("userfaultfd"); + if (err == EPERM) { + ksft_test_result_skip("%s - missing uffd", test_name); + return; + } + success = false; + goto out; + } + if (ioctl(uffd, UFFDIO_API, &api)) { + perror("ioctl UFFDIO_API"); + success = false; + goto out_close_uffd; + } + + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + success = false; + goto out_close_uffd; + } + + tgt_ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (tgt_ptr == MAP_FAILED) { + perror("mmap"); + success = false; + goto out_close_uffd; + } + if (munmap(tgt_ptr, size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + + /* + * Unmap so we end up with: + * + * 0 2 4 6 8 10 offset in buffer + * |*| |*| |*| |*| |*| + * |*| |*| |*| |*| |*| + * + * Additionally, register each with UFFD. + */ + for (i = 0; i < 10; i += 2) { + void *unmap_ptr = &ptr[(i + 1) * page_size]; + unsigned long start = (unsigned long)&ptr[i * page_size]; + struct uffdio_register reg = { + .range = { + .start = start, + .len = page_size, + }, + .mode = UFFDIO_REGISTER_MODE_MISSING, + }; + + if (ioctl(uffd, UFFDIO_REGISTER, ®) == -1) { + perror("ioctl UFFDIO_REGISTER"); + success = false; + goto out_unmap; + } + if (munmap(unmap_ptr, page_size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + } + + /* + * Now try to move the entire range which is invalid for multi VMA move. 
+ * + * This will fail, and no VMA should be moved, as we check this ahead of + * time. + */ + res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr); + err = errno; + if (res != MAP_FAILED) { + fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n"); + success = false; + goto out_unmap; + } + if (err != EFAULT) { + errno = err; + perror("mrmeap() unexpected error"); + success = false; + goto out_unmap; + } + if (is_ptr_mapped(maps_fp, tgt_ptr, page_size)) { + fprintf(stderr, + "Invalid uffd-armed VMA at start of multi range moved\n"); + success = false; + goto out_unmap; + } + + /* + * Now try to move a single VMA, this should succeed as not multi VMA + * move. + */ + res = mremap(ptr, page_size, page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr); + if (res == MAP_FAILED) { + perror("mremap single invalid-multi VMA"); + success = false; + goto out_unmap; + } + + /* + * Unmap the VMA, and remap a non-uffd registered (therefore, multi VMA + * move valid) VMA at the start of ptr range. + */ + if (munmap(tgt_ptr, page_size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + res = mmap(ptr, page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + if (res == MAP_FAILED) { + perror("mmap"); + success = false; + goto out_unmap; + } + + /* + * Now try to move the entire range, we should succeed in moving the + * first VMA, but no others, and report a failure. 
+ */ + res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr); + err = errno; + if (res != MAP_FAILED) { + fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n"); + success = false; + goto out_unmap; + } + if (err != EFAULT) { + errno = err; + perror("mrmeap() unexpected error"); + success = false; + goto out_unmap; + } + if (!is_ptr_mapped(maps_fp, tgt_ptr, page_size)) { + fprintf(stderr, "Valid VMA not moved\n"); + success = false; + goto out_unmap; + } + + /* + * Unmap the VMA, and map valid VMA at start of ptr range, and replace + * all existing multi-move invalid VMAs, except the last, with valid + * multi-move VMAs. + */ + if (munmap(tgt_ptr, page_size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + if (munmap(ptr, size - 2 * page_size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + for (i = 0; i < 8; i += 2) { + res = mmap(&ptr[i * page_size], page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + if (res == MAP_FAILED) { + perror("mmap"); + success = false; + goto out_unmap; + } + } + + /* + * Now try to move the entire range, we should succeed in moving all but + * the last VMA, and report a failure. 
+ */ + res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr); + err = errno; + if (res != MAP_FAILED) { + fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n"); + success = false; + goto out_unmap; + } + if (err != EFAULT) { + errno = err; + perror("mrmeap() unexpected error"); + success = false; + goto out_unmap; + } + + for (i = 0; i < 10; i += 2) { + bool is_mapped = is_ptr_mapped(maps_fp, + &tgt_ptr[i * page_size], page_size); + + if (i < 8 && !is_mapped) { + fprintf(stderr, "Valid VMA not moved at %d\n", i); + success = false; + goto out_unmap; + } else if (i == 8 && is_mapped) { + fprintf(stderr, "Invalid VMA moved at %d\n", i); + success = false; + goto out_unmap; + } + } + +out_unmap: + if (munmap(tgt_ptr, size)) + perror("munmap tgt"); + if (munmap(ptr, size)) + perror("munmap src"); +out_close_uffd: + close(uffd); +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} +#else +static void mremap_move_multi_invalid_vmas(FILE *maps_fp, unsigned long page_size) +{ + char *test_name = "mremap move multiple invalid vmas"; + + ksft_test_result_skip("%s - missing uffd", test_name); +} +#endif /* __NR_userfaultfd */ + /* Returns the time taken for the remap on success else returns -1. 
*/ static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) @@ -1074,7 +1331,7 @@ int main(int argc, char **argv) char *rand_addr; size_t rand_size; int num_expand_tests = 2; - int num_misc_tests = 8; + int num_misc_tests = 9; struct test test_cases[MAX_TEST] = {}; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -1197,8 +1454,6 @@ int main(int argc, char **argv) mremap_expand_merge(maps_fp, page_size); mremap_expand_merge_offset(maps_fp, page_size); - fclose(maps_fp); - mremap_move_within_range(pattern_seed, rand_addr); mremap_move_1mb_from_start(pattern_seed, rand_addr); mremap_shrink_multiple_vmas(page_size, /* inplace= */true); @@ -1207,6 +1462,9 @@ int main(int argc, char **argv) mremap_move_multiple_vmas(pattern_seed, page_size, /* dontunmap= */ true); mremap_move_multiple_vmas_split(pattern_seed, page_size, /* dontunmap= */ false); mremap_move_multiple_vmas_split(pattern_seed, page_size, /* dontunmap= */ true); + mremap_move_multi_invalid_vmas(maps_fp, page_size); + + fclose(maps_fp); if (run_perf_tests) { ksft_print_msg("\n%s\n", diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 0d4209eef0c3..e6face7c0166 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1525,7 +1525,7 @@ void zeropfn_tests(void) ret = madvise(mem, hpage_size, MADV_HUGEPAGE); if (!ret) { - FORCE_READ(mem); + FORCE_READ(*mem); ret = pagemap_ioctl(mem, hpage_size, &vec, 1, 0, 0, PAGE_IS_PFNZERO, 0, 0, PAGE_IS_PFNZERO); diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 05de1fc0005b..44a3f8a58806 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -439,8 +439,11 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, } madvise(*addr, fd_size, MADV_HUGEPAGE); - for 
(size_t i = 0; i < fd_size; i++) - FORCE_READ((*addr + i)); + for (size_t i = 0; i < fd_size; i++) { + char *addr2 = *addr + i; + + FORCE_READ(*addr2); + } if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index c20298ae98ea..b55d1809debc 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -23,7 +23,7 @@ * anything with it in order to trigger a read page fault. We therefore must use * volatile to stop the compiler from optimising this away. */ -#define FORCE_READ(x) (*(volatile typeof(x) *)x) +#define FORCE_READ(x) (*(const volatile typeof(x) *)&(x)) extern unsigned int __page_size; extern unsigned int __page_shift; diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index b1e4618399be..a688871a98eb 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -107,6 +107,26 @@ #endif #endif +#ifndef __NR_open_tree_attr + #if defined __alpha__ + #define __NR_open_tree_attr 577 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_open_tree_attr (467 + 4000) + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_open_tree_attr (467 + 6000) + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_open_tree_attr (467 + 5000) + #endif + #elif defined __ia64__ + #define __NR_open_tree_attr (467 + 1024) + #else + #define __NR_open_tree_attr 467 + #endif +#endif + #ifndef MOUNT_ATTR_IDMAP #define MOUNT_ATTR_IDMAP 0x00100000 #endif @@ -121,6 +141,12 @@ static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flag return syscall(__NR_mount_setattr, dfd, path, flags, attr, size); } 
+static inline int sys_open_tree_attr(int dfd, const char *path, unsigned int flags, + struct mount_attr *attr, size_t size) +{ + return syscall(__NR_open_tree_attr, dfd, path, flags, attr, size); +} + static ssize_t write_nointr(int fd, const void *buf, size_t count) { ssize_t ret; @@ -1222,6 +1248,12 @@ TEST_F(mount_setattr_idmapped, attached_mount_inside_current_mount_namespace) attr.userns_fd = get_userns_fd(0, 10000, 10000); ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + ASSERT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } @@ -1255,6 +1287,12 @@ TEST_F(mount_setattr_idmapped, attached_mount_outside_current_mount_namespace) ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + ASSERT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } @@ -1321,6 +1359,19 @@ TEST_F(mount_setattr_idmapped, detached_mount_outside_current_mount_namespace) ASSERT_EQ(close(open_tree_fd), 0); } +static bool expected_uid_gid(int dfd, const char *path, int flags, + uid_t expected_uid, gid_t expected_gid) +{ + int ret; + struct stat st; + + ret = fstatat(dfd, path, &st, flags); + if (ret < 0) + return false; + + return st.st_uid == expected_uid && st.st_gid == expected_gid; +} + /** * Validate that currently changing the idmapping of an idmapped mount fails. 
*/ @@ -1331,6 +1382,8 @@ TEST_F(mount_setattr_idmapped, change_idmapping) .attr_set = MOUNT_ATTR_IDMAP, }; + ASSERT_TRUE(expected_uid_gid(-EBADF, "/mnt/D", 0, 0, 0)); + if (!mount_setattr_supported()) SKIP(return, "mount_setattr syscall not supported"); @@ -1348,27 +1401,25 @@ TEST_F(mount_setattr_idmapped, change_idmapping) AT_EMPTY_PATH, &attr, sizeof(attr)), 0); ASSERT_EQ(close(attr.userns_fd), 0); + EXPECT_FALSE(expected_uid_gid(open_tree_fd, ".", 0, 0, 0)); + EXPECT_TRUE(expected_uid_gid(open_tree_fd, ".", 0, 10000, 10000)); + /* Change idmapping on a detached mount that is already idmapped. */ attr.userns_fd = get_userns_fd(0, 20000, 10000); ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + EXPECT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + EXPECT_FALSE(expected_uid_gid(open_tree_fd, ".", 0, 20000, 20000)); + EXPECT_TRUE(expected_uid_gid(open_tree_fd, ".", 0, 10000, 10000)); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } -static bool expected_uid_gid(int dfd, const char *path, int flags, - uid_t expected_uid, gid_t expected_gid) -{ - int ret; - struct stat st; - - ret = fstatat(dfd, path, &st, flags); - if (ret < 0) - return false; - - return st.st_uid == expected_uid && st.st_gid == expected_gid; -} - TEST_F(mount_setattr_idmapped, idmap_mount_tree_invalid) { int open_tree_fd = -EBADF; diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore new file mode 100644 index 000000000000..ccfb40837a73 --- /dev/null +++ b/tools/testing/selftests/namespaces/.gitignore @@ -0,0 +1,3 @@ +nsid_test +file_handle_test +init_ino_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile new file mode 100644 index 
000000000000..5fe4b3dc07d3 --- /dev/null +++ b/tools/testing/selftests/namespaces/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) + +TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test + +include ../lib.mk + diff --git a/tools/testing/selftests/namespaces/config b/tools/testing/selftests/namespaces/config new file mode 100644 index 000000000000..d09836260262 --- /dev/null +++ b/tools/testing/selftests/namespaces/config @@ -0,0 +1,7 @@ +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CGROUPS=y diff --git a/tools/testing/selftests/namespaces/file_handle_test.c b/tools/testing/selftests/namespaces/file_handle_test.c new file mode 100644 index 000000000000..f1bc5773f552 --- /dev/null +++ b/tools/testing/selftests/namespaces/file_handle_test.c @@ -0,0 +1,1429 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <grp.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/unistd.h> +#include "../kselftest_harness.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +TEST(nsfs_net_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = 
name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT as unprivileged user */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + if (fd < 0 && errno == EPERM) { + SKIP(free(handle); close(ns_fd); + return, + "Permission denied for unprivileged user (expected)"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_uts_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open UTS namespace file descriptor */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with 
FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_ipc_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open IPC namespace file descriptor */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_pid_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + 
ASSERT_NE(handle, NULL); + + /* Open PID namespace file descriptor */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_mnt_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open mount namespace file descriptor */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + 
SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open user namespace file descriptor */ + ns_fd = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_cgroup_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 
65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open cgroup namespace file descriptor */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "cgroup namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_time_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open time namespace file descriptor */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "time namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + 
ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_net_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current network namespace */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); 
/* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create network namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's network namespace handle from new user+net namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new network namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_uts_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + 
pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current UTS namespace */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create UTS namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open 
parent's UTS namespace handle from new user+uts namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new UTS namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_ipc_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current IPC namespace */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + 
close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create IPC namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's IPC namespace handle from new user+ipc namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + 
return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new IPC namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_mnt_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current mount namespace */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + 
+ /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create mount namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's mount namespace handle from new user+mnt namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new mount namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_cgroup_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + 
ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current cgroup namespace */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "cgroup namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create cgroup namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's cgroup namespace handle from new user+cgroup namespace */ 
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new cgroup namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_pid_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current PID namespace */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop 
privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new PID namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create PID namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for PID namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in PID namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new PID namespace */ + /* Try to open parent's PID namespace handle from new user+pid namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for 
grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new PID namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_time_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current time namespace */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "time namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = 
open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new time namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWTIME); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create time namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for time namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in time namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new time namespace */ + /* Try to open parent's time namespace handle from new user+time namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 
0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new time namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_open_flags) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Test invalid flags that should fail */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_WRONLY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDWR); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TRUNC); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECT); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TMPFILE); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECTORY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, ENOTDIR); + + close(ns_fd); + free(handle); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/init_ino_test.c b/tools/testing/selftests/namespaces/init_ino_test.c new file 
mode 100644 index 000000000000..5b6993c3740b --- /dev/null +++ b/tools/testing/selftests/namespaces/init_ino_test.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2025 Christian Brauner <brauner@kernel.org> + +#define _GNU_SOURCE +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <linux/nsfs.h> + +#include "../kselftest_harness.h" + +struct ns_info { + const char *name; + const char *proc_path; + unsigned int expected_ino; +}; + +static struct ns_info namespaces[] = { + { "ipc", "/proc/1/ns/ipc", IPC_NS_INIT_INO }, + { "uts", "/proc/1/ns/uts", UTS_NS_INIT_INO }, + { "user", "/proc/1/ns/user", USER_NS_INIT_INO }, + { "pid", "/proc/1/ns/pid", PID_NS_INIT_INO }, + { "cgroup", "/proc/1/ns/cgroup", CGROUP_NS_INIT_INO }, + { "time", "/proc/1/ns/time", TIME_NS_INIT_INO }, + { "net", "/proc/1/ns/net", NET_NS_INIT_INO }, + { "mnt", "/proc/1/ns/mnt", MNT_NS_INIT_INO }, +}; + +TEST(init_namespace_inodes) +{ + struct stat st; + + for (int i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) { + int ret = stat(namespaces[i].proc_path, &st); + + /* Some namespaces might not be available (e.g., time namespace on older kernels) */ + if (ret < 0) { + if (errno == ENOENT) { + ksft_test_result_skip("%s namespace not available\n", + namespaces[i].name); + continue; + } + ASSERT_GE(ret, 0) + TH_LOG("Failed to stat %s: %s", + namespaces[i].proc_path, strerror(errno)); + } + + ASSERT_EQ(st.st_ino, namespaces[i].expected_ino) + TH_LOG("Namespace %s has inode 0x%lx, expected 0x%x", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + + ksft_print_msg("Namespace %s: inode 0x%lx matches expected 0x%x\n", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c new file mode 100644 index 
000000000000..e28accd74a57 --- /dev/null +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -0,0 +1,986 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <assert.h> +#include <fcntl.h> +#include <inttypes.h> +#include <libgen.h> +#include <limits.h> +#include <pthread.h> +#include <string.h> +#include <sys/mount.h> +#include <poll.h> +#include <sys/epoll.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <unistd.h> +#include <linux/fs.h> +#include <linux/limits.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" + +TEST(nsid_mntns_basic) +{ + __u64 mnt_ns_id = 0; + int fd_mntns; + int ret; + + /* Open the current mount namespace */ + fd_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_mntns, 0); + + /* Get the mount namespace ID */ + ret = ioctl(fd_mntns, NS_GET_MNTNS_ID, &mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(mnt_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 mnt_ns_id2 = 0; + ret = ioctl(fd_mntns, NS_GET_ID, &mnt_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mnt_ns_id, mnt_ns_id2); + + close(fd_mntns); +} + +TEST(nsid_mntns_separate) +{ + __u64 parent_mnt_ns_id = 0; + __u64 child_mnt_ns_id = 0; + int fd_parent_mntns, fd_child_mntns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's mount namespace ID */ + fd_parent_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_parent_mntns, 0); + ret = ioctl(fd_parent_mntns, NS_GET_ID, &parent_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_mnt_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success 
*/ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_mntns); + SKIP(return, "No permission to create mount namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's mount namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid); + fd_child_mntns = open(path, O_RDONLY); + ASSERT_GE(fd_child_mntns, 0); + + /* Get child's mount namespace ID */ + ret = ioctl(fd_child_mntns, NS_GET_ID, &child_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_mnt_ns_id, 0); + + /* Parent and child should have different mount namespace IDs */ + ASSERT_NE(parent_mnt_ns_id, child_mnt_ns_id); + + close(fd_parent_mntns); + close(fd_child_mntns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_cgroupns_basic) +{ + __u64 cgroup_ns_id = 0; + int fd_cgroupns; + int ret; + + /* Open the current cgroup namespace */ + fd_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_cgroupns, 0); + + /* Get the cgroup namespace ID */ + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(cgroup_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 cgroup_ns_id2 = 0; + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(cgroup_ns_id, cgroup_ns_id2); + + close(fd_cgroupns); +} + +TEST(nsid_cgroupns_separate) +{ + __u64 parent_cgroup_ns_id = 0; + __u64 child_cgroup_ns_id = 0; + int fd_parent_cgroupns, fd_child_cgroupns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's cgroup namespace ID */ + fd_parent_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_parent_cgroupns, 0); + ret = ioctl(fd_parent_cgroupns, 
NS_GET_ID, &parent_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_cgroup_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_cgroupns); + SKIP(return, "No permission to create cgroup namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's cgroup namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", pid); + fd_child_cgroupns = open(path, O_RDONLY); + ASSERT_GE(fd_child_cgroupns, 0); + + /* Get child's cgroup namespace ID */ + ret = ioctl(fd_child_cgroupns, NS_GET_ID, &child_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_cgroup_ns_id, 0); + + /* Parent and child should have different cgroup namespace IDs */ + ASSERT_NE(parent_cgroup_ns_id, child_cgroup_ns_id); + + close(fd_parent_cgroupns); + close(fd_child_cgroupns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_ipcns_basic) +{ + __u64 ipc_ns_id = 0; + int fd_ipcns; + int ret; + + /* Open the current IPC namespace */ + fd_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_ipcns, 0); + + /* Get the IPC namespace ID */ + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(ipc_ns_id, 0); + + /* Verify we can get the 
same ID again */ + __u64 ipc_ns_id2 = 0; + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(ipc_ns_id, ipc_ns_id2); + + close(fd_ipcns); +} + +TEST(nsid_ipcns_separate) +{ + __u64 parent_ipc_ns_id = 0; + __u64 child_ipc_ns_id = 0; + int fd_parent_ipcns, fd_child_ipcns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's IPC namespace ID */ + fd_parent_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_parent_ipcns, 0); + ret = ioctl(fd_parent_ipcns, NS_GET_ID, &parent_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_ipc_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_ipcns); + SKIP(return, "No permission to create IPC namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's IPC namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/ipc", pid); + fd_child_ipcns = open(path, O_RDONLY); + ASSERT_GE(fd_child_ipcns, 0); + + /* Get child's IPC namespace ID */ + ret = ioctl(fd_child_ipcns, NS_GET_ID, &child_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_ipc_ns_id, 0); + + /* Parent and child should have different IPC namespace IDs */ + ASSERT_NE(parent_ipc_ns_id, child_ipc_ns_id); + + close(fd_parent_ipcns); + 
close(fd_child_ipcns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_utsns_basic) +{ + __u64 uts_ns_id = 0; + int fd_utsns; + int ret; + + /* Open the current UTS namespace */ + fd_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_utsns, 0); + + /* Get the UTS namespace ID */ + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(uts_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 uts_ns_id2 = 0; + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(uts_ns_id, uts_ns_id2); + + close(fd_utsns); +} + +TEST(nsid_utsns_separate) +{ + __u64 parent_uts_ns_id = 0; + __u64 child_uts_ns_id = 0; + int fd_parent_utsns, fd_child_utsns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's UTS namespace ID */ + fd_parent_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_parent_utsns, 0); + ret = ioctl(fd_parent_utsns, NS_GET_ID, &parent_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_uts_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_utsns); + SKIP(return, "No permission to create UTS namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's UTS namespace */ + 
char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid); + fd_child_utsns = open(path, O_RDONLY); + ASSERT_GE(fd_child_utsns, 0); + + /* Get child's UTS namespace ID */ + ret = ioctl(fd_child_utsns, NS_GET_ID, &child_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_uts_ns_id, 0); + + /* Parent and child should have different UTS namespace IDs */ + ASSERT_NE(parent_uts_ns_id, child_uts_ns_id); + + close(fd_parent_utsns); + close(fd_child_utsns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_userns_basic) +{ + __u64 user_ns_id = 0; + int fd_userns; + int ret; + + /* Open the current user namespace */ + fd_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_userns, 0); + + /* Get the user namespace ID */ + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(user_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 user_ns_id2 = 0; + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(user_ns_id, user_ns_id2); + + close(fd_userns); +} + +TEST(nsid_userns_separate) +{ + __u64 parent_user_ns_id = 0; + __u64 child_user_ns_id = 0; + int fd_parent_userns, fd_child_userns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's user namespace ID */ + fd_parent_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_parent_userns, 0); + ret = ioctl(fd_parent_userns, NS_GET_ID, &parent_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_user_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 
1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_userns); + SKIP(return, "No permission to create user namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's user namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); + fd_child_userns = open(path, O_RDONLY); + ASSERT_GE(fd_child_userns, 0); + + /* Get child's user namespace ID */ + ret = ioctl(fd_child_userns, NS_GET_ID, &child_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_user_ns_id, 0); + + /* Parent and child should have different user namespace IDs */ + ASSERT_NE(parent_user_ns_id, child_user_ns_id); + + close(fd_parent_userns); + close(fd_child_userns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_timens_basic) +{ + __u64 time_ns_id = 0; + int fd_timens; + int ret; + + /* Open the current time namespace */ + fd_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get the time namespace ID */ + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(time_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 time_ns_id2 = 0; + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(time_ns_id, time_ns_id2); + + close(fd_timens); +} + +TEST(nsid_timens_separate) +{ + __u64 parent_time_ns_id = 0; + __u64 child_time_ns_id = 0; + int fd_parent_timens, fd_child_timens; + int ret; + pid_t pid; + int pipefd[2]; + + /* Open the current time namespace */ + fd_parent_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_parent_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* 
Get parent's time namespace ID */ + ret = ioctl(fd_parent_timens, NS_GET_ID, &parent_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_time_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new time namespace */ + ret = unshare(CLONE_NEWTIME); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES || errno == EINVAL) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_timens); + close(pipefd[0]); + SKIP(return, "Cannot create time namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's time namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/time", grandchild_pid); + fd_child_timens = open(path, O_RDONLY); + ASSERT_GE(fd_child_timens, 0); + + /* Get child's time namespace ID */ + ret = ioctl(fd_child_timens, NS_GET_ID, &child_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_time_ns_id, 0); + + /* Parent and child should 
have different time namespace IDs */ + ASSERT_NE(parent_time_ns_id, child_time_ns_id); + + close(fd_parent_timens); + close(fd_child_timens); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_pidns_basic) +{ + __u64 pid_ns_id = 0; + int fd_pidns; + int ret; + + /* Open the current PID namespace */ + fd_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_pidns, 0); + + /* Get the PID namespace ID */ + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(pid_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 pid_ns_id2 = 0; + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(pid_ns_id, pid_ns_id2); + + close(fd_pidns); +} + +TEST(nsid_pidns_separate) +{ + __u64 parent_pid_ns_id = 0; + __u64 child_pid_ns_id = 0; + int fd_parent_pidns, fd_child_pidns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's PID namespace ID */ + fd_parent_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_parent_pidns, 0); + ret = ioctl(fd_parent_pidns, NS_GET_ID, &parent_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_pid_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + 
close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_pidns); + close(pipefd[0]); + SKIP(return, "No permission to create PID namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's PID namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/pid", grandchild_pid); + fd_child_pidns = open(path, O_RDONLY); + ASSERT_GE(fd_child_pidns, 0); + + /* Get child's PID namespace ID */ + ret = ioctl(fd_child_pidns, NS_GET_ID, &child_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_pid_ns_id, 0); + + /* Parent and child should have different PID namespace IDs */ + ASSERT_NE(parent_pid_ns_id, child_pid_ns_id); + + close(fd_parent_pidns); + close(fd_child_pidns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_netns_basic) +{ + __u64 net_ns_id = 0; + __u64 netns_cookie = 0; + int fd_netns; + int sock; + socklen_t optlen; + int ret; + + /* Open the current network namespace */ + fd_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_netns, 0); + + /* Get the network namespace ID via ioctl */ + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(net_ns_id, 0); + + /* Create a socket to get the SO_NETNS_COOKIE */ + sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(sock, 0); + + /* Get the network namespace cookie via socket option */ + optlen = sizeof(netns_cookie); + ret = getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + ASSERT_EQ(optlen, sizeof(netns_cookie)); + + /* The 
namespace ID and cookie should be identical */ + ASSERT_EQ(net_ns_id, netns_cookie); + + /* Verify we can get the same ID again */ + __u64 net_ns_id2 = 0; + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(net_ns_id, net_ns_id2); + + close(sock); + close(fd_netns); +} + +TEST(nsid_netns_separate) +{ + __u64 parent_net_ns_id = 0; + __u64 parent_netns_cookie = 0; + __u64 child_net_ns_id = 0; + __u64 child_netns_cookie = 0; + int fd_parent_netns, fd_child_netns; + int parent_sock, child_sock; + socklen_t optlen; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's network namespace ID */ + fd_parent_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_parent_netns, 0); + ret = ioctl(fd_parent_netns, NS_GET_ID, &parent_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_net_ns_id, 0); + + /* Get parent's network namespace cookie */ + parent_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(parent_sock, 0); + optlen = sizeof(parent_netns_cookie); + ret = getsockopt(parent_sock, SOL_SOCKET, SO_NETNS_COOKIE, &parent_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify parent's ID and cookie match */ + ASSERT_EQ(parent_net_ns_id, parent_netns_cookie); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + 
waitpid(pid, NULL, 0); + close(fd_parent_netns); + close(parent_sock); + SKIP(return, "No permission to create network namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's network namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/net", pid); + fd_child_netns = open(path, O_RDONLY); + ASSERT_GE(fd_child_netns, 0); + + /* Get child's network namespace ID */ + ret = ioctl(fd_child_netns, NS_GET_ID, &child_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_net_ns_id, 0); + + /* Create socket in child's namespace to get cookie */ + ret = setns(fd_child_netns, CLONE_NEWNET); + if (ret == 0) { + child_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(child_sock, 0); + + optlen = sizeof(child_netns_cookie); + ret = getsockopt(child_sock, SOL_SOCKET, SO_NETNS_COOKIE, &child_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify child's ID and cookie match */ + ASSERT_EQ(child_net_ns_id, child_netns_cookie); + + close(child_sock); + + /* Return to parent namespace; fail loudly rather than continue in the child's one */ + ASSERT_EQ(setns(fd_parent_netns, CLONE_NEWNET), 0); + } + + /* Parent and child should have different network namespace IDs */ + ASSERT_NE(parent_net_ns_id, child_net_ns_id); + if (child_netns_cookie != 0) { + ASSERT_NE(parent_netns_cookie, child_netns_cookie); + } + + close(fd_parent_netns); + close(fd_child_netns); + close(parent_sock); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b31a71f2b372..2b31d4a93ad7 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -99,6 +99,7 @@ TEST_GEN_PROGS += bind_wildcard TEST_GEN_PROGS += bind_timewait TEST_PROGS += test_vxlan_mdb.sh TEST_PROGS += test_bridge_neigh_suppress.sh +TEST_PROGS += test_vxlan_nh.sh TEST_PROGS += test_vxlan_nolocalbypass.sh TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += test_neigh.sh @@ -115,6 +116,7 @@ TEST_PROGS +=
skf_net_off.sh TEST_GEN_FILES += skf_net_off TEST_GEN_FILES += tfo TEST_PROGS += tfo_passive.sh +TEST_PROGS += broadcast_ether_dst.sh TEST_PROGS += broadcast_pmtu.sh TEST_PROGS += ipv6_force_forwarding.sh diff --git a/tools/testing/selftests/net/bind_bhash.c b/tools/testing/selftests/net/bind_bhash.c index 57ff67a3751e..da04b0b19b73 100644 --- a/tools/testing/selftests/net/bind_bhash.c +++ b/tools/testing/selftests/net/bind_bhash.c @@ -75,7 +75,7 @@ static void *setup(void *arg) int *array = (int *)arg; for (i = 0; i < MAX_CONNECTIONS; i++) { - sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr); + sock_fd = bind_socket(SO_REUSEPORT, setup_addr); if (sock_fd < 0) { ret = sock_fd; pthread_exit(&ret); @@ -103,7 +103,7 @@ int main(int argc, const char *argv[]) setup_addr = use_v6 ? setup_addr_v6 : setup_addr_v4; - listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr); + listener_fd = bind_socket(SO_REUSEPORT, setup_addr); if (listen(listener_fd, 100) < 0) { perror("listen failed"); return -1; diff --git a/tools/testing/selftests/net/broadcast_ether_dst.sh b/tools/testing/selftests/net/broadcast_ether_dst.sh new file mode 100755 index 000000000000..334a7eca8a80 --- /dev/null +++ b/tools/testing/selftests/net/broadcast_ether_dst.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Author: Brett A C Sheffield <bacs@librecast.net> +# Author: Oscar Maes <oscmaes92@gmail.com> +# +# Ensure destination ethernet field is correctly set for +# broadcast packets + +source lib.sh + +CLIENT_IP4="192.168.0.1" +GW_IP4="192.168.0.2" + +setup() { + setup_ns CLIENT_NS SERVER_NS + + ip -net "${SERVER_NS}" link add link1 type veth \ + peer name link0 netns "${CLIENT_NS}" + + ip -net "${CLIENT_NS}" link set link0 up + ip -net "${CLIENT_NS}" addr add "${CLIENT_IP4}"/24 dev link0 + + ip -net "${SERVER_NS}" link set link1 up + + ip -net "${CLIENT_NS}" route add default via "${GW_IP4}" + ip netns exec "${CLIENT_NS}" arp -s "${GW_IP4}" 
00:11:22:33:44:55 +} + +cleanup() { + rm -f "${CAPFILE}" "${OUTPUT}" + ip -net "${SERVER_NS}" link del link1 + cleanup_ns "${CLIENT_NS}" "${SERVER_NS}" +} + +test_broadcast_ether_dst() { + local rc=0 + CAPFILE=$(mktemp -u cap.XXXXXXXXXX) + OUTPUT=$(mktemp -u out.XXXXXXXXXX) + + echo "Testing ethernet broadcast destination" + + # start tcpdump listening for icmp + # tcpdump will exit after receiving a single packet + # timeout will kill tcpdump if it is still running after 2s + timeout 2s ip netns exec "${CLIENT_NS}" \ + tcpdump -i link0 -c 1 -w "${CAPFILE}" icmp &> "${OUTPUT}" & + pid=$! + slowwait 1 grep -qs "listening" "${OUTPUT}" + + # send broadcast ping + ip netns exec "${CLIENT_NS}" \ + ping -W0.01 -c1 -b 255.255.255.255 &> /dev/null + + # wait for tcpdump to exit after receiving packet + wait "${pid}" + + # compare ethernet destination field to ff:ff:ff:ff:ff:ff + ether_dst=$(tcpdump -r "${CAPFILE}" -tnne 2>/dev/null | \ + awk '{sub(/,/,"",$3); print $3}') + if [[ "${ether_dst}" == "ff:ff:ff:ff:ff:ff" ]]; then + echo "[ OK ]" + rc="${ksft_pass}" + else + echo "[FAIL] expected dst ether addr to be ff:ff:ff:ff:ff:ff," \ + "got ${ether_dst}" + rc="${ksft_fail}" + fi + + return "${rc}" +} + +if [ ! -x "$(command -v tcpdump)" ]; then + echo "SKIP: Could not run test without tcpdump tool" + exit "${ksft_skip}" +fi + +trap cleanup EXIT + +setup +test_broadcast_ether_dst + +exit $? diff --git a/tools/testing/selftests/net/can/config b/tools/testing/selftests/net/can/config new file mode 100644 index 000000000000..188f79796670 --- /dev/null +++ b/tools/testing/selftests/net/can/config @@ -0,0 +1,3 @@ +CONFIG_CAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_VCAN=m diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b39f748c2572..2b0a90581e2f 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -467,8 +467,8 @@ ipv6_fdb_grp_fcnal() log_test $?
0 "Get Fdb nexthop group by id" # fdb nexthop group can only contain fdb nexthops - run_cmd "$IP nexthop add id 63 via 2001:db8:91::4" - run_cmd "$IP nexthop add id 64 via 2001:db8:91::5" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::5 dev veth1" run_cmd "$IP nexthop add id 103 group 63/64 fdb" log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" @@ -494,6 +494,26 @@ ipv6_fdb_grp_fcnal() run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb" log_test $? 2 "Fdb Nexthop with encap" + # Replace FDB nexthop to non-FDB and vice versa + run_cmd "$IP nexthop add id 70 via 2001:db8:91::2 fdb" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1" + log_test $? 0 "Replace FDB nexthop to non-FDB nexthop" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 fdb" + log_test $? 0 "Replace non-FDB nexthop to FDB nexthop" + + # Replace FDB nexthop address while in a group + run_cmd "$IP nexthop add id 71 group 70 fdb" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::3 fdb" + log_test $? 0 "Replace FDB nexthop address while in a group" + + # Cannot replace FDB nexthop to non-FDB and vice versa while in a group + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1" + log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group" + run_cmd "$IP nexthop add id 72 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 73 group 72" + run_cmd "$IP nexthop replace id 72 via 2001:db8:91::2 fdb" + log_test $? 2 "Replace non-FDB nexthop to FDB nexthop while in a group" + run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100" run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" log_test $? 0 "Fdb mac add with nexthop group" @@ -547,15 +567,15 @@ ipv4_fdb_grp_fcnal() log_test $? 
0 "Get Fdb nexthop group by id" # fdb nexthop group can only contain fdb nexthops - run_cmd "$IP nexthop add id 14 via 172.16.1.2" - run_cmd "$IP nexthop add id 15 via 172.16.1.3" + run_cmd "$IP nexthop add id 14 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 15 via 172.16.1.3 dev veth1" run_cmd "$IP nexthop add id 103 group 14/15 fdb" log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" # Non fdb nexthop group can not contain fdb nexthops run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb" run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb" - run_cmd "$IP nexthop add id 104 group 14/15" + run_cmd "$IP nexthop add id 104 group 16/17" log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" # fdb nexthop cannot have blackhole @@ -574,6 +594,26 @@ ipv4_fdb_grp_fcnal() run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb" log_test $? 2 "Fdb Nexthop with encap" + # Replace FDB nexthop to non-FDB and vice versa + run_cmd "$IP nexthop add id 18 via 172.16.1.2 fdb" + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1" + log_test $? 0 "Replace FDB nexthop to non-FDB nexthop" + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 fdb" + log_test $? 0 "Replace non-FDB nexthop to FDB nexthop" + + # Replace FDB nexthop address while in a group + run_cmd "$IP nexthop add id 19 group 18 fdb" + run_cmd "$IP nexthop replace id 18 via 172.16.1.3 fdb" + log_test $? 0 "Replace FDB nexthop address while in a group" + + # Cannot replace FDB nexthop to non-FDB and vice versa while in a group + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1" + log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group" + run_cmd "$IP nexthop add id 20 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 21 group 20" + run_cmd "$IP nexthop replace id 20 via 172.16.1.2 fdb" + log_test $? 
2 "Replace non-FDB nexthop to FDB nexthop while in a group" + run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100" run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" log_test $? 0 "Fdb mac add with nexthop group" @@ -582,7 +622,7 @@ ipv4_fdb_grp_fcnal() run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self" log_test $? 255 "Fdb mac add with nexthop" - run_cmd "$IP ro add 172.16.0.0/22 nhid 15" + run_cmd "$IP ro add 172.16.0.0/22 nhid 16" log_test $? 2 "Route add with fdb nexthop" run_cmd "$IP ro add 172.16.0.0/22 nhid 103" diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh index b98ea9449b8b..dfb6646cb97b 100755 --- a/tools/testing/selftests/net/forwarding/router.sh +++ b/tools/testing/selftests/net/forwarding/router.sh @@ -18,6 +18,8 @@ # | 2001:db8:1::1/64 2001:db8:2::1/64 | # | | # +-----------------------------------------------------------------+ +# +#shellcheck disable=SC2034 # SC doesn't see our uses of global variables ALL_TESTS=" ping_ipv4 @@ -27,6 +29,7 @@ ALL_TESTS=" ipv4_sip_equal_dip ipv6_sip_equal_dip ipv4_dip_link_local + ipv4_sip_link_local " NUM_NETIFS=4 @@ -330,6 +333,32 @@ ipv4_dip_link_local() tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower } +ipv4_sip_link_local() +{ + local sip=169.254.1.1 + + RET=0 + + # Disable rpfilter to prevent packets to be dropped because of it. + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf."$rp1".rp_filter 0 + + tc filter add dev "$rp2" egress protocol ip pref 1 handle 101 \ + flower src_ip "$sip" action pass + + $MZ "$h1" -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b "$rp1mac" \ + -A "$sip" -B 198.51.100.2 -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? 
"Packets were dropped" + + log_test "IPv4 source IP is link-local" + + tc filter del dev "$rp2" egress protocol ip pref 1 handle 101 flower + sysctl_restore net.ipv4.conf."$rp1".rp_filter + sysctl_restore net.ipv4.conf.all.rp_filter +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh index 1f6f53e284b5..6269d5e23487 100755 --- a/tools/testing/selftests/net/forwarding/sch_ets.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets.sh @@ -11,6 +11,7 @@ ALL_TESTS=" ets_test_strict ets_test_mixed ets_test_dwrr + ets_test_plug classifier_mode ets_test_strict ets_test_mixed diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh index 08240d3e3c87..79d837a2868a 100644 --- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh @@ -224,3 +224,11 @@ ets_test_dwrr() ets_set_dwrr_two_bands xfail_on_slow ets_dwrr_test_01 } + +ets_test_plug() +{ + ets_change_qdisc $put 2 "3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3" "1514 1514" + tc qdisc add dev $put handle 20: parent 10:4 plug + start_traffic_pktsize 100 $h1.10 192.0.2.1 192.0.2.2 00:c1:a0:c1:a0:00 "-c 1" + ets_qdisc_setup $put 2 +} diff --git a/tools/testing/selftests/net/lib/xdp_native.bpf.c b/tools/testing/selftests/net/lib/xdp_native.bpf.c index 521ba38f2ddd..df4eea5c192b 100644 --- a/tools/testing/selftests/net/lib/xdp_native.bpf.c +++ b/tools/testing/selftests/net/lib/xdp_native.bpf.c @@ -14,6 +14,8 @@ #define MAX_PAYLOAD_LEN 5000 #define MAX_HDR_LEN 64 +extern int bpf_xdp_pull_data(struct xdp_md *xdp, __u32 len) __ksym __weak; + enum { XDP_MODE = 0, XDP_PORT = 1, @@ -68,30 +70,57 @@ static void record_stats(struct xdp_md *ctx, __u32 stat_type) static struct udphdr *filter_udphdr(struct xdp_md *ctx, __u16 port) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; 
struct udphdr *udph = NULL; - struct ethhdr *eth = data; + void *data, *data_end; + struct ethhdr *eth; + int err; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = eth = (void *)(long)ctx->data; if (data + sizeof(*eth) > data_end) return NULL; if (eth->h_proto == bpf_htons(ETH_P_IP)) { - struct iphdr *iph = data + sizeof(*eth); + struct iphdr *iph; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*iph) + + sizeof(*udph)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + iph = data + sizeof(*eth); if (iph + 1 > (struct iphdr *)data_end || iph->protocol != IPPROTO_UDP) return NULL; - udph = (void *)eth + sizeof(*iph) + sizeof(*eth); - } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = data + sizeof(*eth); + udph = data + sizeof(*iph) + sizeof(*eth); + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + struct ipv6hdr *ipv6h; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*ipv6h) + + sizeof(*udph)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + ipv6h = data + sizeof(*eth); if (ipv6h + 1 > (struct ipv6hdr *)data_end || ipv6h->nexthdr != IPPROTO_UDP) return NULL; - udph = (void *)eth + sizeof(*ipv6h) + sizeof(*eth); + udph = data + sizeof(*ipv6h) + sizeof(*eth); } else { return NULL; } @@ -145,17 +174,34 @@ static void swap_machdr(void *data) static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; struct udphdr *udph = NULL; - struct ethhdr *eth = data; + void *data, *data_end; + struct ethhdr *eth; + int err; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = eth = (void *)(long)ctx->data; if (data + sizeof(*eth) > data_end) return XDP_PASS; if (eth->h_proto 
== bpf_htons(ETH_P_IP)) { - struct iphdr *iph = data + sizeof(*eth); - __be32 tmp_ip = iph->saddr; + struct iphdr *iph; + __be32 tmp_ip; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*iph) + + sizeof(*udph)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + iph = data + sizeof(*eth); if (iph + 1 > (struct iphdr *)data_end || iph->protocol != IPPROTO_UDP) @@ -169,8 +215,10 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_PASS; record_stats(ctx, STATS_RX); + eth = data; swap_machdr((void *)eth); + tmp_ip = iph->saddr; iph->saddr = iph->daddr; iph->daddr = tmp_ip; @@ -178,9 +226,19 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_TX; - } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = data + sizeof(*eth); + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { struct in6_addr tmp_ipv6; + struct ipv6hdr *ipv6h; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*ipv6h) + + sizeof(*udph)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + ipv6h = data + sizeof(*eth); if (ipv6h + 1 > (struct ipv6hdr *)data_end || ipv6h->nexthdr != IPPROTO_UDP) @@ -194,6 +252,7 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_PASS; record_stats(ctx, STATS_RX); + eth = data; swap_machdr((void *)eth); __builtin_memcpy(&tmp_ipv6, &ipv6h->saddr, sizeof(tmp_ipv6)); diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 7a3cb4c09e45..d847ff1737c3 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -28,7 +28,7 @@ flush_pids() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null diff --git 
a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index ac1349c4b9e5..b148cadb96d0 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -183,9 +183,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; @@ -1092,6 +1093,7 @@ int main_loop_s(int listensock) struct pollfd polls; socklen_t salen; int remotesock; + int err = 0; int fd = 0; again: @@ -1124,7 +1126,7 @@ again: SOCK_TEST_TCPULP(remotesock, 0); memset(&winfo, 0, sizeof(winfo)); - copyfd_io(fd, remotesock, 1, true, &winfo); + err = copyfd_io(fd, remotesock, 1, true, &winfo); } else { perror("accept"); return 1; @@ -1133,10 +1135,10 @@ again: if (cfg_input) close(fd); - if (--cfg_repeat > 0) + if (!err && --cfg_repeat > 0) goto again; - return 0; + return err; } static void init_rng(void) @@ -1246,7 +1248,7 @@ void xdisconnect(int fd) else xerror("bad family"); - strcpy(cmd, "ss -M | grep -q "); + strcpy(cmd, "ss -Mnt | grep -q "); cmdlen = strlen(cmd); if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen], sizeof(cmd) - cmdlen)) @@ -1256,7 +1258,7 @@ void xdisconnect(int fd) /* * wait until the pending data is completely flushed and all - * the MPTCP sockets reached the closed status. + * the sockets reached the closed status. * disconnect will bypass/ignore/drop any pending data. 
*/ for (i = 0; ; i += msec_sleep) { diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 5e3c56253274..47ecb5b3836e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -134,7 +134,7 @@ ns4="" TEST_GROUP="" # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cin_disconnect" @@ -211,6 +211,11 @@ if $checksum; then done fi +if $capture; then + rndh="${ns1:4}" + mptcp_lib_pr_info "Packet capture files will have this prefix: ${rndh}-" +fi + set_ethtool_flags() { local ns="$1" local dev="$2" @@ -361,7 +366,6 @@ do_transfer() if $capture; then local capuser - local rndh="${connector_ns:4}" if [ -z $SUDO_USER ] ; then capuser="" else diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c index 3cf1e2a612ce..f3bcaa48df8f 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_inq.c +++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c @@ -75,9 +75,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b8af65373b3a..7fd555b123b9 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -8,7 +8,7 @@ # ShellCheck incorrectly believes that most of the code here is unreachable # because it's invoked by variable name, see how the "tests" array is used -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 . 
"$(dirname "${0}")/mptcp_lib.sh" @@ -3842,6 +3842,7 @@ endpoint_tests() # remove and re-add if reset_with_events "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=0 pm_nl_set_limits $ns1 0 3 pm_nl_set_limits $ns2 3 3 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 09cd24b2ae46..d62e653d48b0 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -384,7 +384,7 @@ mptcp_lib_make_file() { mptcp_lib_print_file_err() { ls -l "${1}" 1>&2 echo "Trailing bytes are: " - tail -c 27 "${1}" + tail -c 32 "${1}" | od -x | head -n2 } # $1: input file ; $2: output file ; $3: what kind of file diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index 9934a68df237..112c07c4c37a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -162,9 +162,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; @@ -666,22 +667,26 @@ static void process_one_client(int fd, int pipefd) do_getsockopts(&s, fd, ret, ret2); if (s.mptcpi_rcv_delta != (uint64_t)ret + 1) - xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64, s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - ret); + xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64 ", diff %" PRId64, + s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - (ret + 1)); /* be nice when running on top of older kernel */ if (s.pkt_stats_avail) { if (s.last_sample.mptcpi_bytes_sent != ret2) - xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64, + 
xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_sent, ret2, s.last_sample.mptcpi_bytes_sent - ret2); if (s.last_sample.mptcpi_bytes_received != ret) - xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_received, ret, s.last_sample.mptcpi_bytes_received - ret); if (s.last_sample.mptcpi_bytes_acked != ret) - xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64, - s.last_sample.mptcpi_bytes_acked, ret2, - s.last_sample.mptcpi_bytes_acked - ret2); + xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, + s.last_sample.mptcpi_bytes_acked, ret, + s.last_sample.mptcpi_bytes_acked - ret); } close(fd); diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 418a903c3a4d..f01989be6e9b 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -95,7 +95,7 @@ init() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { mptcp_lib_ns_exit "${ns1}" "${ns2}" "${ns_sbox}" diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 2e6648a2b2c0..ec6a87588191 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -32,7 +32,7 @@ ns1="" err=$(mktemp) # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "${err}" @@ -70,8 +70,9 @@ format_endpoints() { mptcp_lib_pm_nl_format_endpoints "${@}" } +# This function is invoked indirectly +#shellcheck disable=SC2317,SC2329 get_endpoint() { - # shellcheck disable=SC2317 # invoked indirectly mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" } @@ -198,6 +199,7 @@ 
set_limits 1 9 2>/dev/null check "get_limits" "${default_limits}" "subflows above hard limit" set_limits 8 8 +flush_endpoint ## to make sure it doesn't affect the limits check "get_limits" "$(format_limits 8 8)" "set limits" flush_endpoint diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 994a556f46c1..93fea3442216 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -188,6 +188,13 @@ static int capture_events(int fd, int event_group) fprintf(stderr, ",error:%u", *(__u8 *)RTA_DATA(attrs)); else if (attrs->rta_type == MPTCP_ATTR_SERVER_SIDE) fprintf(stderr, ",server_side:%u", *(__u8 *)RTA_DATA(attrs)); + else if (attrs->rta_type == MPTCP_ATTR_FLAGS) { + __u16 flags = *(__u16 *)RTA_DATA(attrs); + + /* only print when present, easier */ + if (flags & MPTCP_PM_EV_FLAG_DENY_JOIN_ID0) + fprintf(stderr, ",deny_join_id0:1"); + } attrs = RTA_NEXT(attrs, msg_len); } diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 2329c2f8519b..1903e8e84a31 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -35,7 +35,7 @@ usage() { } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cout" "$sout" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 333064b0b5ac..3d45991f24ed 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -94,7 +94,7 @@ test_fail() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { print_title "Cleanup" @@ -201,6 +201,9 @@ make_connection() is_v6="v4" fi + # set this on the client side only: will not affect the rest + ip netns 
exec "$ns2" sysctl -q net.mptcp.allow_join_initial_addr_port=0 + :>"$client_evts" :>"$server_evts" @@ -223,23 +226,28 @@ make_connection() local client_token local client_port local client_serverside + local client_nojoin local server_token local server_serverside + local server_nojoin client_token=$(mptcp_lib_evts_get_info token "$client_evts") client_port=$(mptcp_lib_evts_get_info sport "$client_evts") client_serverside=$(mptcp_lib_evts_get_info server_side "$client_evts") + client_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$client_evts") server_token=$(mptcp_lib_evts_get_info token "$server_evts") server_serverside=$(mptcp_lib_evts_get_info server_side "$server_evts") + server_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$server_evts") print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" - if [ "$client_token" != "" ] && [ "$server_token" != "" ] && [ "$client_serverside" = 0 ] && - [ "$server_serverside" = 1 ] + if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && + [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}" else - test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})" + test_fail "Expected tokens (c:${client_token} - s:${server_token}), server (c:${client_serverside} - s:${server_serverside}), nojoin (c:${client_nojoin} - s:${server_nojoin})" mptcp_lib_result_print_all_tap exit ${KSFT_FAIL} fi diff --git a/tools/testing/selftests/net/netfilter/conntrack_clash.sh b/tools/testing/selftests/net/netfilter/conntrack_clash.sh index 606a43a60f73..7fc6c5dbd551 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_clash.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_clash.sh @@ -99,7 +99,7 @@ run_one_clash_test() local entries local cre - if ! 
ip netns exec "$ns" ./udpclash $daddr $dport;then + if ! ip netns exec "$ns" timeout 30 ./udpclash $daddr $dport;then echo "INFO: did not receive expected number of replies for $daddr:$dport" ip netns exec "$ctns" conntrack -S # don't fail: check if clash resolution triggered after all. diff --git a/tools/testing/selftests/net/netfilter/conntrack_resize.sh b/tools/testing/selftests/net/netfilter/conntrack_resize.sh index 788cd56ea4a0..615fe3c6f405 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_resize.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_resize.sh @@ -187,7 +187,7 @@ ct_udpclash() [ -x udpclash ] || return while [ $now -lt $end ]; do - ip netns exec "$ns" ./udpclash 127.0.0.1 $((RANDOM%65536)) > /dev/null 2>&1 + ip netns exec "$ns" timeout 30 ./udpclash 127.0.0.1 $((RANDOM%65536)) > /dev/null 2>&1 now=$(date +%s) done @@ -277,6 +277,7 @@ check_taint() insert_flood() { local n="$1" + local timeout="$2" local r=0 r=$((RANDOM%$insert_count)) @@ -302,7 +303,7 @@ test_floodresize_all() read tainted_then < /proc/sys/kernel/tainted for n in "$nsclient1" "$nsclient2";do - insert_flood "$n" & + insert_flood "$n" "$timeout" & done # resize table constantly while flood/insert/dump/flushs diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a4ee5496f2a1..45832df98295 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -20,6 +20,7 @@ ret=0 SOCAT_TIMEOUT=60 nsin="" +nsin_small="" ns1out="" ns2out="" @@ -36,7 +37,7 @@ cleanup() { cleanup_all_ns - rm -f "$nsin" "$ns1out" "$ns2out" + rm -f "$nsin" "$nsin_small" "$ns1out" "$ns2out" [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns" } @@ -72,6 +73,7 @@ lmtu=1500 rmtu=2000 filesize=$((2 * 1024 * 1024)) +filesize_small=$((filesize / 16)) usage(){ echo "nft_flowtable.sh [OPTIONS]" @@ -89,7 +91,10 @@ do o) omtu=$OPTARG;; 
l) lmtu=$OPTARG;; r) rmtu=$OPTARG;; - s) filesize=$OPTARG;; + s) + filesize=$OPTARG + filesize_small=$((OPTARG / 16)) + ;; *) usage;; esac done @@ -215,6 +220,7 @@ if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then fi nsin=$(mktemp) +nsin_small=$(mktemp) ns1out=$(mktemp) ns2out=$(mktemp) @@ -265,6 +271,7 @@ check_counters() check_dscp() { local what=$1 + local pmtud="$2" local ok=1 local counter @@ -277,37 +284,39 @@ check_dscp() local pc4z=${counter%*bytes*} local pc4z=${pc4z#*packets} + local failmsg="FAIL: pmtu $pmtu: $what counters do not match, expected" + case "$what" in "dscp_none") if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 + echo "$failmsg dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_fwd") if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 + echo "$failmsg dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_ingress") if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + echo "$failmsg dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_egress") if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + echo "$failmsg dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; *) - echo "FAIL: Unknown DSCP check" 1>&2 + echo "$failmsg: Unknown DSCP check" 1>&2 ret=1 ok=0 esac @@ -319,9 +328,9 @@ check_dscp() check_transfer() { - in=$1 - out=$2 - what=$3 + local in=$1 + local out=$2 + local what=$3 if ! 
cmp "$in" "$out" > /dev/null 2>&1; then echo "FAIL: file mismatch for $what" 1>&2 @@ -342,25 +351,39 @@ test_tcp_forwarding_ip() { local nsa=$1 local nsb=$2 - local dstip=$3 - local dstport=$4 + local pmtu=$3 + local dstip=$4 + local dstport=$5 local lret=0 + local socatc + local socatl + local infile="$nsin" + + if [ $pmtu -eq 0 ]; then + infile="$nsin_small" + fi - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" & + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & lpid=$! busywait 1000 listener_ready - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out" + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" + socatc=$? wait $lpid + socatl=$? - if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then + if [ $socatl -ne 0 ] || [ $socatc -ne 0 ];then + rc=1 + fi + + if ! check_transfer "$infile" "$ns2out" "ns1 -> ns2"; then lret=1 ret=1 fi - if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then + if ! check_transfer "$infile" "$ns1out" "ns1 <- ns2"; then lret=1 ret=1 fi @@ -370,14 +393,16 @@ test_tcp_forwarding_ip() test_tcp_forwarding() { - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 + local pmtu="$3" + + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 return $? } test_tcp_forwarding_set_dscp() { - check_dscp "dscp_none" + local pmtu="$3" ip netns exec "$nsr1" nft -f - <<EOF table netdev dscpmangle { @@ -388,8 +413,8 @@ table netdev dscpmangle { } EOF if [ $? -eq 0 ]; then - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - check_dscp "dscp_ingress" + test_tcp_forwarding_ip "$1" "$2" "$3" 10.0.2.99 12345 + check_dscp "dscp_ingress" "$pmtu" ip netns exec "$nsr1" nft delete table netdev dscpmangle else @@ -405,10 +430,10 @@ table netdev dscpmangle { } EOF if [ $? 
-eq 0 ]; then - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - check_dscp "dscp_egress" + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + check_dscp "dscp_egress" "$pmtu" - ip netns exec "$nsr1" nft flush table netdev dscpmangle + ip netns exec "$nsr1" nft delete table netdev dscpmangle else echo "SKIP: Could not load netdev:egress for veth1" fi @@ -416,48 +441,53 @@ fi # partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3 # counters should have seen packets (before and after ft offload kicks in). ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3 - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - check_dscp "dscp_fwd" + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + check_dscp "dscp_fwd" "$pmtu" } test_tcp_forwarding_nat() { + local nsa="$1" + local nsb="$2" + local pmtu="$3" + local what="$4" local lret - local pmtu - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - lret=$? + [ "$pmtu" -eq 0 ] && what="$what (pmtu disabled)" - pmtu=$3 - what=$4 + test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 10.0.2.99 12345 + lret=$? if [ "$lret" -eq 0 ] ; then if [ "$pmtu" -eq 1 ] ;then - check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what" + check_counters "flow offload for ns1/ns2 with masquerade $what" else echo "PASS: flow offload for ns1/ns2 with masquerade $what" fi - test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.6.6.6 1666 lret=$? 
if [ "$pmtu" -eq 1 ] ;then - check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what" + check_counters "flow offload for ns1/ns2 with dnat $what" elif [ "$lret" -eq 0 ] ; then echo "PASS: flow offload for ns1/ns2 with dnat $what" fi + else + echo "FAIL: flow offload for ns1/ns2 with dnat $what" fi return $lret } make_file "$nsin" "$filesize" +make_file "$nsin_small" "$filesize_small" # First test: # No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. # Due to MTU mismatch in both directions, all packets (except small packets like pure # acks) have to be handled by normal forwarding path. Therefore, packet counters # are not checked. -if test_tcp_forwarding "$ns1" "$ns2"; then +if test_tcp_forwarding "$ns1" "$ns2" 0; then echo "PASS: flow offloaded for ns1/ns2" else echo "FAIL: flow offload for ns1/ns2:" 1>&2 @@ -489,8 +519,9 @@ table ip nat { } EOF +check_dscp "dscp_none" "0" if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then - echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2 + echo "FAIL: flow offload for ns1/ns2 with dscp update and no pmtu discovery" 1>&2 exit 0 fi @@ -513,6 +544,14 @@ ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null # For earlier tests (large mtus), packets cannot be handled via flowtable # (except pure acks and other small packets). ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null +ip netns exec "$ns2" nft reset counters table inet filter >/dev/null + +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 ""; then + echo "FAIL: flow offload for ns1/ns2 with dscp update and pmtu discovery" 1>&2 + exit 0 +fi + +ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null if ! 
test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 @@ -644,7 +683,7 @@ ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 ip -net "$ns2" route add default via 10.0.2.1 ip -net "$ns2" route add default via dead:2::1 -if test_tcp_forwarding "$ns1" "$ns2"; then +if test_tcp_forwarding "$ns1" "$ns2" 1; then check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" @@ -668,7 +707,7 @@ if [ "$1" = "" ]; then fi echo "re-run with random mtus and file size: -o $o -l $l -r $r -s $filesize" - $0 -o "$o" -l "$l" -r "$r" -s "$filesize" + $0 -o "$o" -l "$l" -r "$r" -s "$filesize" || ret=1 fi exit $ret diff --git a/tools/testing/selftests/net/netfilter/udpclash.c b/tools/testing/selftests/net/netfilter/udpclash.c index 85c7b906ad08..79de163d61ab 100644 --- a/tools/testing/selftests/net/netfilter/udpclash.c +++ b/tools/testing/selftests/net/netfilter/udpclash.c @@ -29,7 +29,7 @@ struct thread_args { int sockfd; }; -static int wait = 1; +static volatile int wait = 1; static void *thread_main(void *varg) { diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 3c8d3455d8e7..b327d3061ed5 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -25,6 +25,7 @@ tests=" nat_related_v4 ip4-nat-related: ICMP related matches work with SNAT netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces + tunnel_metadata ovs: test extraction of tunnel metadata drop_reason drop: test drop reasons are emitted psample psample: Sampling packets with psample" @@ -113,13 +114,13 @@ ovs_add_dp () { } ovs_add_if () { - info "Adding IF to DP: br:$2 if:$3" - if [ "$4" != "-u" ]; then - ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if "$2" "$3" \ - || return 1 + info "Adding IF to DP: br:$3 if:$4 ($2)" + 
if [ "$5" != "-u" ]; then + ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if \ + -t "$2" "$3" "$4" || return 1 else python3 $ovs_base/ovs-dpctl.py add-if \ - -u "$2" "$3" >$ovs_dir/$3.out 2>$ovs_dir/$3.err & + -u -t "$2" "$3" "$4" >$ovs_dir/$4.out 2>$ovs_dir/$4.err & pid=$! on_exit "ovs_sbx $1 kill -TERM $pid 2>/dev/null" fi @@ -166,9 +167,9 @@ ovs_add_netns_and_veths () { fi if [ "$7" != "-u" ]; then - ovs_add_if "$1" "$2" "$4" || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" || return 1 else - ovs_add_if "$1" "$2" "$4" -u || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" -u || return 1 fi if [ $TRACING -eq 1 ]; then @@ -756,6 +757,79 @@ test_upcall_interfaces() { return 0 } +ovs_add_kernel_tunnel() { + local sbxname=$1; shift + local ns=$1; shift + local tnl_type=$1; shift + local name=$1; shift + local addr=$1; shift + + info "setting up kernel ${tnl_type} tunnel ${name}" + ovs_sbx "${sbxname}" ip -netns ${ns} link add dev ${name} type ${tnl_type} $* || return 1 + on_exit "ovs_sbx ${sbxname} ip -netns ${ns} link del ${name} >/dev/null 2>&1" + ovs_sbx "${sbxname}" ip -netns ${ns} addr add dev ${name} ${addr} || return 1 + ovs_sbx "${sbxname}" ip -netns ${ns} link set dev ${name} mtu 1450 up || return 1 +} + +test_tunnel_metadata() { + which arping >/dev/null 2>&1 || return $ksft_skip + + sbxname="test_tunnel_metadata" + sbx_add "${sbxname}" || return 1 + + info "setting up new DP" + ovs_add_dp "${sbxname}" tdp0 -V 2:1 || return 1 + + ovs_add_netns_and_veths "${sbxname}" tdp0 tns left0 l0 \ + 172.31.110.1/24 || return 1 + + info "removing veth interface from openvswitch and setting IP" + ovs_del_if "${sbxname}" tdp0 left0 || return 1 + ovs_sbx "${sbxname}" ip addr add 172.31.110.2/24 dev left0 || return 1 + ovs_sbx "${sbxname}" ip link set left0 up || return 1 + + info "setting up tunnel port in openvswitch" + ovs_add_if "${sbxname}" "vxlan" tdp0 ovs-vxlan0 -u || return 1 + on_exit "ovs_sbx ${sbxname} ip link del ovs-vxlan0" + ovs_wait ip link show ovs-vxlan0 
&>/dev/null || return 1 + ovs_sbx "${sbxname}" ip link set ovs-vxlan0 up || return 1 + + configs=$(echo ' + 1 172.31.221.1/24 1155332 32 set udpcsum flags\(df\|csum\) + 2 172.31.222.1/24 1234567 45 set noudpcsum flags\(df\) + 3 172.31.223.1/24 1020304 23 unset udpcsum flags\(csum\) + 4 172.31.224.1/24 1357986 15 unset noudpcsum' | sed '/^$/d') + + while read -r i addr id ttl df csum flags; do + ovs_add_kernel_tunnel "${sbxname}" tns vxlan vxlan${i} ${addr} \ + remote 172.31.110.2 id ${id} dstport 4789 \ + ttl ${ttl} df ${df} ${csum} || return 1 + done <<< "${configs}" + + ovs_wait grep -q 'listening on upcall packet handler' \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + + info "sending arping" + for i in 1 2 3 4; do + ovs_sbx "${sbxname}" ip netns exec tns \ + arping -I vxlan${i} 172.31.22${i}.2 -c 1 \ + >${ovs_dir}/arping.stdout 2>${ovs_dir}/arping.stderr + done + + info "checking that received decapsulated packets carry correct metadata" + while read -r i addr id ttl df csum flags; do + arp_hdr="arp\\(sip=172.31.22${i}.1,tip=172.31.22${i}.2,op=1,sha=" + addrs="src=172.31.110.1,dst=172.31.110.2" + ports="tp_src=[0-9]*,tp_dst=4789" + tnl_md="tunnel\\(tun_id=${id},${addrs},ttl=${ttl},${ports},${flags}\\)" + + ovs_sbx "${sbxname}" grep -qE "MISS upcall.*${tnl_md}.*${arp_hdr}" \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + done <<< "${configs}" + + return 0 +} + run_test() { ( tname="$1" diff --git a/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt new file mode 100644 index 000000000000..26794e7ddfd5 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh + ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x602 /proc/sys/net/ipv4/tcp_timestamps=0` + + 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 + +0 setsockopt(3, 
SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:10(10) win 32792 <mss 1460,nop,nop,sackOK> + +0 > S. 0:0(0) ack 11 win 65535 <mss 1460,nop,nop,sackOK> + +// sk->sk_state is TCP_SYN_RECV + +.1 accept(3, ..., ...) = 4 + +// tcp_disconnect() sets sk->sk_state to TCP_CLOSE + +0 connect(4, AF_UNSPEC, ...) = 0 + +0 > R. 1:1(0) ack 11 win 65535 + +// connect() sets sk->sk_state to TCP_SYN_SENT + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation is now in progress) + +0 > S 0:0(0) win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// tp->fastopen_rsk must be NULL + +1 > S 0:0(0) win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 8> diff --git a/tools/testing/selftests/net/test_vxlan_nh.sh b/tools/testing/selftests/net/test_vxlan_nh.sh new file mode 100755 index 000000000000..20f3369f776b --- /dev/null +++ b/tools/testing/selftests/net/test_vxlan_nh.sh @@ -0,0 +1,223 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh +TESTS=" + basic_tx_ipv4 + basic_tx_ipv6 + learning + proxy_ipv4 + proxy_ipv6 +" +VERBOSE=0 + +################################################################################ +# Utilities + +run_cmd() +{ + local cmd="$1" + local out + local stderr="2>/dev/null" + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + stderr= + fi + + out=$(eval "$cmd" "$stderr") + rc=$? 
+ if [ "$VERBOSE" -eq 1 ] && [ -n "$out" ]; then + echo " $out" + fi + + return $rc +} + +################################################################################ +# Cleanup + +exit_cleanup_all() +{ + cleanup_all_ns + exit "${EXIT_STATUS}" +} + +################################################################################ +# Tests + +nh_stats_get() +{ + ip -n "$ns1" -s -j nexthop show id 10 | jq ".[][\"group_stats\"][][\"packets\"]" +} + +tc_stats_get() +{ + tc_rule_handle_stats_get "dev dummy1 egress" 101 ".packets" "-n $ns1" +} + +basic_tx_common() +{ + local af_str=$1; shift + local proto=$1; shift + local local_addr=$1; shift + local plen=$1; shift + local remote_addr=$1; shift + + RET=0 + + # Test basic Tx functionality. Check that stats are incremented on + # both the FDB nexthop group and the egress device. + + run_cmd "ip -n $ns1 link add name dummy1 up type dummy" + run_cmd "ip -n $ns1 route add $remote_addr/$plen dev dummy1" + run_cmd "tc -n $ns1 qdisc add dev dummy1 clsact" + run_cmd "tc -n $ns1 filter add dev dummy1 egress proto $proto pref 1 handle 101 flower ip_proto udp dst_ip $remote_addr dst_port 4789 action pass" + + run_cmd "ip -n $ns1 address add $local_addr/$plen dev lo" + + run_cmd "ip -n $ns1 nexthop add id 1 via $remote_addr fdb" + run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb" + + run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local $local_addr dstport 4789" + run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static nhid 10" + + run_cmd "ip netns exec $ns1 mausezahn vx0 -a own -b 00:11:22:33:44:55 -c 1 -q" + + busywait "$BUSYWAIT_TIMEOUT" until_counter_is "== 1" nh_stats_get > /dev/null + check_err $? "FDB nexthop group stats did not increase" + + busywait "$BUSYWAIT_TIMEOUT" until_counter_is "== 1" tc_stats_get > /dev/null + check_err $? 
"tc filter stats did not increase" + + log_test "VXLAN FDB nexthop: $af_str basic Tx" +} + +basic_tx_ipv4() +{ + basic_tx_common "IPv4" ipv4 192.0.2.1 32 192.0.2.2 +} + +basic_tx_ipv6() +{ + basic_tx_common "IPv6" ipv6 2001:db8:1::1 128 2001:db8:1::2 +} + +learning() +{ + RET=0 + + # When learning is enabled on the VXLAN device, an incoming packet + # might try to refresh an FDB entry that points to an FDB nexthop group + # instead of an ordinary remote destination. Check that the kernel does + # not crash in this situation. + + run_cmd "ip -n $ns1 address add 192.0.2.1/32 dev lo" + run_cmd "ip -n $ns1 address add 192.0.2.2/32 dev lo" + + run_cmd "ip -n $ns1 nexthop add id 1 via 192.0.2.3 fdb" + run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb" + + run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local 192.0.2.1 dstport 12345 localbypass" + run_cmd "ip -n $ns1 link add name vx1 up type vxlan id 10020 local 192.0.2.2 dstport 54321 learning" + + run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static dst 192.0.2.2 port 54321 vni 10020" + run_cmd "bridge -n $ns1 fdb add 00:aa:bb:cc:dd:ee dev vx1 self static nhid 10" + + run_cmd "ip netns exec $ns1 mausezahn vx0 -a 00:aa:bb:cc:dd:ee -b 00:11:22:33:44:55 -c 1 -q" + + log_test "VXLAN FDB nexthop: learning" +} + +proxy_common() +{ + local af_str=$1; shift + local local_addr=$1; shift + local plen=$1; shift + local remote_addr=$1; shift + local neigh_addr=$1; shift + local ping_cmd=$1; shift + + RET=0 + + # When the "proxy" option is enabled on the VXLAN device, the device + # will suppress ARP requests and IPv6 Neighbor Solicitation messages if + # it is able to reply on behalf of the remote host. That is, if a + # matching and valid neighbor entry is configured on the VXLAN device + # whose MAC address is not behind the "any" remote (0.0.0.0 / ::). The + # FDB entry for the neighbor's MAC address might point to an FDB + # nexthop group instead of an ordinary remote destination. 
Check that + # the kernel does not crash in this situation. + + run_cmd "ip -n $ns1 address add $local_addr/$plen dev lo" + + run_cmd "ip -n $ns1 nexthop add id 1 via $remote_addr fdb" + run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb" + + run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local $local_addr dstport 4789 proxy" + + run_cmd "ip -n $ns1 neigh add $neigh_addr lladdr 00:11:22:33:44:55 nud perm dev vx0" + + run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static nhid 10" + + run_cmd "ip netns exec $ns1 $ping_cmd" + + log_test "VXLAN FDB nexthop: $af_str proxy" +} + +proxy_ipv4() +{ + proxy_common "IPv4" 192.0.2.1 32 192.0.2.2 192.0.2.3 \ + "arping -b -c 1 -s 192.0.2.1 -I vx0 192.0.2.3" +} + +proxy_ipv6() +{ + proxy_common "IPv6" 2001:db8:1::1 128 2001:db8:1::2 2001:db8:1::3 \ + "ndisc6 -r 1 -s 2001:db8:1::1 -w 1 2001:db8:1::3 vx0" +} + +################################################################################ +# Usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -t <test> Test(s) to run (default: all) + (options: $TESTS) + -p Pause on fail + -v Verbose mode (show commands and output) +EOF +} + +################################################################################ +# Main + +while getopts ":t:pvh" opt; do + case $opt in + t) TESTS=$OPTARG;; + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=$((VERBOSE + 1));; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +require_command mausezahn +require_command arping +require_command ndisc6 +require_command jq + +if ! 
ip nexthop help 2>&1 | grep -q "stats"; then + echo "SKIP: iproute2 ip too old, missing nexthop stats support" + exit "$ksft_skip" +fi + +trap exit_cleanup_all EXIT + +for t in $TESTS +do + setup_ns ns1; $t; cleanup_all_ns; +done diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 5ded3b3a7538..dd093f9df6f1 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -181,13 +181,12 @@ static int tls_send_cmsg(int fd, unsigned char record_type, return sendmsg(fd, &msg, flags); } -static int tls_recv_cmsg(struct __test_metadata *_metadata, - int fd, unsigned char record_type, - void *data, size_t len, int flags) +static int __tls_recv_cmsg(struct __test_metadata *_metadata, + int fd, unsigned char *ctype, + void *data, size_t len, int flags) { char cbuf[CMSG_SPACE(sizeof(char))]; struct cmsghdr *cmsg; - unsigned char ctype; struct msghdr msg; struct iovec vec; int n; @@ -206,7 +205,20 @@ static int tls_recv_cmsg(struct __test_metadata *_metadata, EXPECT_NE(cmsg, NULL); EXPECT_EQ(cmsg->cmsg_level, SOL_TLS); EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE); - ctype = *((unsigned char *)CMSG_DATA(cmsg)); + if (ctype) + *ctype = *((unsigned char *)CMSG_DATA(cmsg)); + + return n; +} + +static int tls_recv_cmsg(struct __test_metadata *_metadata, + int fd, unsigned char record_type, + void *data, size_t len, int flags) +{ + unsigned char ctype; + int n; + + n = __tls_recv_cmsg(_metadata, fd, &ctype, data, len, flags); EXPECT_EQ(ctype, record_type); return n; @@ -2164,6 +2176,284 @@ TEST_F(tls, rekey_poll_delay) } } +struct raw_rec { + unsigned int plain_len; + unsigned char plain_data[100]; + unsigned int cipher_len; + unsigned char cipher_data[128]; +}; + +/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: 'Hello world' */ +static const struct raw_rec id0_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data 
= { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0xa2, 0x33, + 0xde, 0x8d, 0x94, 0xf0, 0x29, 0x6c, 0xb1, 0xaf, + 0x6a, 0x75, 0xb2, 0x93, 0xad, 0x45, 0xd5, 0xfd, + 0x03, 0x51, 0x57, 0x8f, 0xf9, 0xcc, 0x3b, 0x42, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:0, plaintext: '' */ +static const struct raw_rec id0_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x38, 0x7b, + 0xa6, 0x1c, 0xdd, 0xa7, 0x19, 0x33, 0xab, 0xae, + 0x88, 0xe1, 0xd2, 0x08, 0x4f, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: '' */ +static const struct raw_rec id0_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xc5, 0x37, 0x90, + 0x70, 0x45, 0x89, 0xfb, 0x5c, 0xc7, 0x89, 0x03, + 0x68, 0x80, 0xd3, 0xd8, 0xcc, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: 'Hello world' */ +static const struct raw_rec id1_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x3a, 0x1a, 0x9c, + 0xd0, 0xa8, 0x9a, 0xd6, 0x69, 0xd6, 0x1a, 0xe3, + 0xb5, 0x1f, 0x0d, 0x2c, 0xe2, 0x97, 0x46, 0xff, + 0x2b, 0xcc, 0x5a, 0xc4, 0xa3, 0xb9, 0xef, 0xba, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:1, plaintext: '' */ +static const struct raw_rec id1_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x3e, 0xf0, 0xfe, + 0xee, 0xd9, 0xe2, 0x5d, 0xc7, 0x11, 0x4c, 0xe6, + 0xb4, 0x7e, 0xef, 0x40, 0x2b, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: '' */ +static const struct raw_rec id1_data_l0 
= { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0xce, 0xfc, 0x86, + 0xc8, 0xf0, 0x55, 0xf9, 0x47, 0x3f, 0x74, 0xdc, + 0xc9, 0xbf, 0xfe, 0x5b, 0xb1, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: 'Hello world' */ +static const struct raw_rec id2_ctrl_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19, + 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87, + 0x2a, 0x04, 0x11, 0x3d, 0xf8, 0x64, 0x5f, 0x36, + 0x8b, 0xa8, 0xee, 0x4c, 0x6d, 0x62, 0xa5, 0x00, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: 'Hello world' */ +static const struct raw_rec id2_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19, + 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87, + 0x8e, 0xa1, 0xd0, 0xcd, 0x33, 0xb5, 0x86, 0x2b, + 0x17, 0xf1, 0x52, 0x2a, 0x55, 0x62, 0x65, 0x11, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: '' */ +static const struct raw_rec id2_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xdc, 0x5c, 0x0e, + 0x41, 0xdd, 0xba, 0xd3, 0xcc, 0xcf, 0x6d, 0xd9, + 0x06, 0xdb, 0x79, 0xe5, 0x5d, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: '' */ +static const struct raw_rec id2_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xc3, 0xca, 0x26, + 0x22, 0xe4, 0x25, 0xfb, 
0x5f, 0x6d, 0xbf, 0x83, + 0x30, 0x48, 0x69, 0x1a, 0x47, + }, +}; + +FIXTURE(zero_len) +{ + int fd, cfd; + bool notls; +}; + +FIXTURE_VARIANT(zero_len) +{ + const struct raw_rec *recs[4]; + ssize_t recv_ret[4]; +}; + +FIXTURE_VARIANT_ADD(zero_len, data_data_data) +{ + .recs = { &id0_data_l11, &id1_data_l11, &id2_data_l11, }, + .recv_ret = { 33, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, data_0ctrl_data) +{ + .recs = { &id0_data_l11, &id1_ctrl_l0, &id2_data_l11, }, + .recv_ret = { 11, 0, 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0data) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l0, }, + .recv_ret = { -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_ctrl) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l11, }, + .recv_ret = { 0, 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0ctrl) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l0, }, + .recv_ret = { 0, 0, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0ctrl_0ctrl_0ctrl) +{ + .recs = { &id0_ctrl_l0, &id1_ctrl_l0, &id2_ctrl_l0, }, + .recv_ret = { 0, 0, 0, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_data) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l11, }, + .recv_ret = { 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, data_0data_0data) +{ + .recs = { &id0_data_l11, &id1_data_l0, &id2_data_l0, }, + .recv_ret = { 11, -EAGAIN, }, +}; + +FIXTURE_SETUP(zero_len) +{ + struct tls_crypto_info_keys tls12; + int ret; + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, + &tls12, 0); + + ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); + if (self->notls) + return; + + /* Don't install keys on fd, we'll send raw records */ + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len); + ASSERT_EQ(ret, 0); +} + +FIXTURE_TEARDOWN(zero_len) +{ + close(self->fd); + close(self->cfd); +} + +TEST_F(zero_len, test) +{ + const struct raw_rec *const *rec; + unsigned char buf[128]; + int rec_off; + 
int i; + + for (i = 0; i < 4 && variant->recs[i]; i++) + EXPECT_EQ(send(self->fd, variant->recs[i]->cipher_data, + variant->recs[i]->cipher_len, 0), + variant->recs[i]->cipher_len); + + rec = &variant->recs[0]; + rec_off = 0; + for (i = 0; i < 4; i++) { + int j, ret; + + ret = variant->recv_ret[i] >= 0 ? variant->recv_ret[i] : -1; + EXPECT_EQ(__tls_recv_cmsg(_metadata, self->cfd, NULL, + buf, sizeof(buf), MSG_DONTWAIT), ret); + if (ret == -1) + EXPECT_EQ(errno, -variant->recv_ret[i]); + if (variant->recv_ret[i] == -EAGAIN) + break; + + for (j = 0; j < ret; j++) { + while (rec_off == (*rec)->plain_len) { + rec++; + rec_off = 0; + } + EXPECT_EQ(buf[j], (*rec)->plain_data[rec_off]); + rec_off++; + } + } +}; + FIXTURE(tls_err) { int fd, cfd; @@ -2480,6 +2770,22 @@ TEST_F(tls_err, poll_partial_rec_async) } } +/* Use OOB+large send to trigger copy mode due to memory pressure. + * OOB causes a short read. + */ +TEST_F(tls_err, oob_pressure) +{ + char buf[1<<16]; + int i; + + memrnd(buf, sizeof(buf)); + + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); + EXPECT_EQ(send(self->fd2, buf, sizeof(buf), 0), sizeof(buf)); + for (i = 0; i < 64; i++) + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); +} + TEST(non_established) { struct tls12_crypto_info_aes_gcm_256 tls12; struct sockaddr_in addr; @@ -2708,6 +3014,67 @@ TEST(prequeue) { close(cfd); } +TEST(data_steal) { + struct tls_crypto_info_keys tls; + char buf[20000], buf2[20000]; + struct sockaddr_in addr; + int sfd, cfd, ret, fd; + int pid, status; + socklen_t len; + + len = sizeof(addr); + memrnd(buf, sizeof(buf)); + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls, 0); + + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = 0; + + fd = socket(AF_INET, SOCK_STREAM, 0); + sfd = socket(AF_INET, SOCK_STREAM, 0); + + ASSERT_EQ(bind(sfd, &addr, sizeof(addr)), 0); + ASSERT_EQ(listen(sfd, 10), 0); + ASSERT_EQ(getsockname(sfd, &addr, &len), 0); + ASSERT_EQ(connect(fd, &addr, 
sizeof(addr)), 0); + ASSERT_GE(cfd = accept(sfd, &addr, &len), 0); + close(sfd); + + ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); + if (ret) { + ASSERT_EQ(errno, ENOENT); + SKIP(return, "no TLS support"); + } + ASSERT_EQ(setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")), 0); + + /* Spawn a child and get it into the read wait path of the underlying + * TCP socket. + */ + pid = fork(); + ASSERT_GE(pid, 0); + if (!pid) { + EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL), + sizeof(buf) / 2); + exit(!__test_passed(_metadata)); + } + + usleep(10000); + ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0); + ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0); + + EXPECT_EQ(send(fd, buf, sizeof(buf), 0), sizeof(buf)); + EXPECT_EQ(wait(&status), pid); + EXPECT_EQ(status, 0); + EXPECT_EQ(recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT), -1); + /* Don't check errno, the error will be different depending + * on what random bytes TLS interpreted as the record length. 
+ */ + + close(fd); + close(cfd); +} + static void __attribute__((constructor)) fips_check(void) { int res; FILE *f; diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config index 6133524710f7..cf7cc0ce0248 100644 --- a/tools/testing/selftests/pidfd/config +++ b/tools/testing/selftests/pidfd/config @@ -4,6 +4,5 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_TIME_NS=y -CONFIG_GENERIC_VDSO_TIME_NS=y CONFIG_CGROUPS=y CONFIG_CHECKPOINT_RESTORE=y diff --git a/tools/testing/selftests/powerpc/include/instructions.h b/tools/testing/selftests/powerpc/include/instructions.h index 4efa6314bd96..864f0c9f1afc 100644 --- a/tools/testing/selftests/powerpc/include/instructions.h +++ b/tools/testing/selftests/powerpc/include/instructions.h @@ -67,7 +67,7 @@ static inline int paste_last(void *i) #define PPC_INST_PASTE_LAST __PASTE(0, 0, 1, 1) /* This defines the prefixed load/store instructions */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define stringify_in_c(...) __VA_ARGS__ #else # define __stringify_in_c(...) 
#__VA_ARGS__ diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 19bb333e2485..6b78a8382d40 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -18,6 +18,7 @@ /proc-tid0 /proc-uptime-001 /proc-uptime-002 +/proc-pidns /read /self /setns-dcache diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 50aba102201a..be3013515aae 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -28,5 +28,6 @@ TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self TEST_GEN_PROGS += proc-multiple-procfs TEST_GEN_PROGS += proc-fsconfig-hidepid +TEST_GEN_PROGS += proc-pidns include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c index 66773685a047..94bba4553130 100644 --- a/tools/testing/selftests/proc/proc-maps-race.c +++ b/tools/testing/selftests/proc/proc-maps-race.c @@ -202,11 +202,11 @@ static void print_first_lines(char *text, int nr) int offs = end - text; text[offs] = '\0'; - printf(text); + printf("%s", text); text[offs] = '\n'; printf("\n"); } else { - printf(text); + printf("%s", text); } } @@ -221,7 +221,7 @@ static void print_last_lines(char *text, int nr) nr--; start--; } - printf(start); + printf("%s", start); } static void print_boundaries(const char *title, FIXTURE_DATA(proc_maps_race) *self) diff --git a/tools/testing/selftests/proc/proc-pidns.c b/tools/testing/selftests/proc/proc-pidns.c new file mode 100644 index 000000000000..52500597f951 --- /dev/null +++ b/tools/testing/selftests/proc/proc-pidns.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai <cyphar@cyphar.com> + * Copyright (C) 2025 SUSE LLC. 
+ */ + +#include <assert.h> +#include <errno.h> +#include <sched.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/prctl.h> + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +static int touch(char *path) +{ + int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC, 0644); + if (fd < 0) + return -1; + return close(fd); +} + +FIXTURE(ns) +{ + int host_mntns, host_pidns; + int dummy_pidns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. */ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); + + /* + * Create a proper tmpfs that we can use and will disappear once we + * leave this mntns. + */ + ASSERT_SUCCESS(mount("tmpfs", "/tmp", "tmpfs", 0, NULL)); + + /* + * Create a pidns we can use for later tests. We need to fork off a + * child so that we get a usable nsfd that we can bind-mount and open. 
+ */ + ASSERT_SUCCESS(mkdir("/tmp/dummy", 0755)); + ASSERT_SUCCESS(touch("/tmp/dummy/pidns")); + ASSERT_SUCCESS(mkdir("/tmp/dummy/proc", 0755)); + + self->host_pidns = open("/proc/self/ns/pid", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_pidns); + ASSERT_SUCCESS(unshare(CLONE_NEWPID)); + + pid_t pid = fork(); + ASSERT_SUCCESS(pid); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGKILL); + ASSERT_SUCCESS(mount("/proc/self/ns/pid", "/tmp/dummy/pidns", NULL, MS_BIND, NULL)); + ASSERT_SUCCESS(mount("proc", "/tmp/dummy/proc", "proc", 0, NULL)); + exit(0); + } + + int wstatus; + ASSERT_EQ(waitpid(pid, &wstatus, 0), pid); + ASSERT_TRUE(WIFEXITED(wstatus)); + ASSERT_EQ(WEXITSTATUS(wstatus), 0); + + ASSERT_SUCCESS(setns(self->host_pidns, CLONE_NEWPID)); + + self->dummy_pidns = open("/tmp/dummy/pidns", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->dummy_pidns); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); + + ASSERT_SUCCESS(close(self->host_pidns)); + ASSERT_SUCCESS(close(self->dummy_pidns)); +} + +TEST_F(ns, pidns_mount_string_path) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc-host", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-host", "proc", 0, "pidns=/proc/self/ns/pid")); + ASSERT_SUCCESS(access("/tmp/proc-host/self/", X_OK)); + + ASSERT_SUCCESS(mkdir("/tmp/proc-dummy", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-dummy", "proc", 0, "pidns=/tmp/dummy/pidns")); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/1/", X_OK)); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/self/", X_OK)); +} + +TEST_F(ns, pidns_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", 
X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_remount) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc", "proc", 0, "")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); + + ASSERT_ERRNO_EQ(-EBUSY, mount(NULL, "/tmp/proc", NULL, MS_REMOUNT, "pidns=/tmp/dummy/pidns")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + 
ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/README b/tools/testing/selftests/riscv/README new file mode 100644 index 000000000000..443da395da68 --- /dev/null +++ b/tools/testing/selftests/riscv/README @@ -0,0 +1,24 @@ +KSelfTest RISC-V +================ + +- These tests are RISC-V specific, so they are neither built nor run but are + skipped completely when the ARCH environment variable is not 'riscv'. 
+ +- Provided the above holds, RISC-V KSFT tests can be run within the + KSelfTest framework using standard Linux top-level-makefile targets: + + $ make TARGETS=riscv kselftest-clean + $ make TARGETS=riscv kselftest + + or + + $ make -C tools/testing/selftests TARGETS=riscv \ + INSTALL_PATH=<your-installation-path> install + + or, alternatively, only specific riscv/ subtargets can be picked: + + $ make -C tools/testing/selftests TARGETS=riscv RISCV_SUBTARGETS="mm vector" \ + INSTALL_PATH=<your-installation-path> install + + Further details on building and running KSFT can be found in: + Documentation/dev-tools/kselftest.rst diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 663a9cef1952..dcac5cbe7933 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -40,9 +40,9 @@ * Define weak versions to play nice with binaries that are statically linked * against a libc that doesn't support registering its own rseq. */ -__weak ptrdiff_t __rseq_offset; -__weak unsigned int __rseq_size; -__weak unsigned int __rseq_flags; +extern __weak ptrdiff_t __rseq_offset; +extern __weak unsigned int __rseq_size; +extern __weak unsigned int __rseq_flags; static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset; static const unsigned int *libc_rseq_size_p = &__rseq_size; @@ -209,7 +209,7 @@ void rseq_init(void) * libc not having registered a restartable sequence. Try to find the * symbols if that's the case. 
*/ - if (!*libc_rseq_size_p) { + if (!libc_rseq_size_p || !*libc_rseq_size_p) { libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c index 1c9ceb661c43..0cfbb111a2d0 100644 --- a/tools/testing/selftests/sched_ext/hotplug.c +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -6,7 +6,6 @@ #include <bpf/bpf.h> #include <sched.h> #include <scx/common.h> -#include <sched.h> #include <sys/wait.h> #include <unistd.h> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 61acbd45ffaa..874f17763536 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -24,6 +24,7 @@ #include <linux/filter.h> #include <sys/prctl.h> #include <sys/ptrace.h> +#include <sys/time.h> #include <sys/user.h> #include <linux/prctl.h> #include <linux/ptrace.h> @@ -73,6 +74,14 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + +#ifndef __naked +#define __naked __attribute__((__naked__)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -3547,6 +3556,10 @@ static void signal_handler(int signal) perror("write from signal"); } +static void signal_handler_nop(int signal) +{ +} + TEST(user_notification_signal) { pid_t pid; @@ -4819,6 +4832,132 @@ TEST(user_notification_wait_killable_fatal) EXPECT_EQ(SIGTERM, WTERMSIG(status)); } +/* Ensure signals after the reply do not interrupt */ +TEST(user_notification_wait_killable_after_reply) +{ + int i, max_iter = 100000; + int listener, status; + int pipe_fds[2]; + pid_t pid; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) + { + TH_LOG("Kernel does not support 
PR_SET_NO_NEW_PRIVS!"); + } + + listener = user_notif_syscall( + __NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV); + ASSERT_GE(listener, 0); + + /* + * Used to count invocations. One token is transferred from the child + * to the parent per syscall invocation, the parent tries to take + * one token per successful RECV. If the syscall is restarted after + * RECV the parent will try to get two tokens while the child only + * provided one. + */ + ASSERT_EQ(pipe(pipe_fds), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct sigaction new_action = { + .sa_handler = signal_handler_nop, + .sa_flags = SA_RESTART, + }; + struct itimerval timer = { + .it_value = { .tv_usec = 1000 }, + .it_interval = { .tv_usec = 1000 }, + }; + char c = 'a'; + + close(pipe_fds[0]); + + /* Setup the sigaction with SA_RESTART */ + if (sigaction(SIGALRM, &new_action, NULL)) { + perror("sigaction"); + exit(1); + } + + /* + * Kill with SIGALRM repeatedly, to try to hit the race when + * handling the syscall. + */ + if (setitimer(ITIMER_REAL, &timer, NULL) < 0) + perror("setitimer"); + + for (i = 0; i < max_iter; ++i) { + int fd; + + /* Send one token per iteration to catch repeats. */ + if (write(pipe_fds[1], &c, sizeof(c)) != 1) { + perror("write"); + exit(1); + } + + fd = syscall(__NR_dup, 0); + if (fd < 0) { + perror("dup"); + exit(1); + } + close(fd); + } + + exit(0); + } + + close(pipe_fds[1]); + + for (i = 0; i < max_iter; ++i) { + struct seccomp_notif req = {}; + struct seccomp_notif_addfd addfd = {}; + struct pollfd pfd = { + .fd = pipe_fds[0], + .events = POLLIN, + }; + char c; + + /* + * Try to receive one token. If it failed, one child syscall + * was restarted after RECV and needed to be handled twice. 
+ */ + ASSERT_EQ(poll(&pfd, 1, 1000), 1) + kill(pid, SIGKILL); + + ASSERT_EQ(read(pipe_fds[0], &c, sizeof(c)), 1) + kill(pid, SIGKILL); + + /* + * Get the notification, reply to it as fast as possible to test + * whether the child wrongly skips going into the non-preemptible + * (TASK_KILLABLE) state. + */ + do + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req); + while (ret < 0 && errno == ENOENT); /* Accept interruptions before RECV */ + ASSERT_EQ(ret, 0) + kill(pid, SIGKILL); + + addfd.id = req.id; + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + addfd.srcfd = 0; + ASSERT_GE(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), 0) + kill(pid, SIGKILL); + } + + /* + * Wait for the process to exit, and make sure the process terminated + * with a zero exit code.. + */ + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + struct tsync_vs_thread_leader_args { pthread_t leader; }; @@ -4896,7 +5035,36 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ + +/* + * We need naked probed_uprobe function. Using __nocf_check + * check to skip possible endbr64 instruction and ignoring + * -Wattributes, otherwise the compilation might fail. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +__naked __nocf_check noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of nop5 instruction. + */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#pragma GCC diagnostic pop + +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -4949,35 +5117,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? 
(uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. + */ + bool uretprobe; +}; + +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe ot __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -4987,12 +5166,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? 
probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5003,7 +5187,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5017,11 +5201,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * Uprobe is optimized after first hit, so let's hit twice. + */ + probed_uprobe(); + probed_uprobe(); + + probed_uretprobe(); return 0; } -TEST_F(URETPROBE, uretprobe_default_allow) +TEST_F(UPROBE, uprobe_default_allow) { struct sock_filter filter[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), @@ -5034,7 +5224,7 @@ TEST_F(URETPROBE, uretprobe_default_allow) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block) +TEST_F(UPROBE, uprobe_default_block) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -5051,11 +5241,14 @@ TEST_F(URETPROBE, uretprobe_default_block) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +TEST_F(UPROBE, uprobe_block_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), #endif @@ -5070,11 +5263,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +TEST_F(UPROBE, uprobe_default_block_with_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + 
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), #endif diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 23a61e5b99d0..998e5a2f4579 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -186,6 +186,204 @@ ] }, { + "id": "34c0", + "name": "Test TBF with HHF Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "hhf" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 hhf limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 hhf limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "fd68", + "name": "Test TBF with CODEL Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "codel" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 codel limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 codel limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + 
"expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "514e", + "name": "Test TBF with PIE Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "pie" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 pie limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 pie limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "6c97", + "name": "Test TBF with FQ Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "fq" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "5d0b", + "name": "Test TBF with FQ_CODEL Backlog Accounting in gso_skb case against underflow", + 
"category": [ + "qdisc", + "tbf", + "fq_codel" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_codel limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_codel limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "21c3", + "name": "Test TBF with FQ_PIE Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "fq_pie" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_pie limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_pie limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { "id": "a4bb", "name": "Test FQ_CODEL with HTB parent - force packet drop with empty queue", "category": [ diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 2d93ac860bd5..cd9fe69ecce2 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -20,7 +20,7 @@ static int 
loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, struct io_uring_sqe *sqe[1]; ublk_io_alloc_sqes(t, sqe, 1); - io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); + io_uring_prep_fsync(sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, IORING_FSYNC_DATASYNC); io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -42,7 +42,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (!sqe[0]) return -ENOMEM; - io_uring_prep_rw(op, sqe[0], 1 /*fds[1]*/, + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, addr, iod->nr_sectors << 9, iod->start_sector << 9); @@ -56,19 +56,19 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); - io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0, + io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, iod->nr_sectors << 9, iod->start_sector << 9); sqe[1]->buf_index = tag; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); return 2; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 95188065b2e9..b71faba86c3b 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -432,7 
+432,7 @@ static void ublk_thread_deinit(struct ublk_thread *t) } } -static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags) +static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) { struct ublk_dev *dev = q->dev; int depth = dev->dev_info.queue_depth; @@ -446,6 +446,9 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags) q->flags = dev->dev_info.flags; q->flags |= extra_flags; + /* Cache fd in queue for fast path access */ + q->ublk_fd = dev->fds[0]; + cmd_buf_size = ublk_queue_cmd_buf_sz(q); off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ, @@ -481,9 +484,10 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags) return -ENOMEM; } -static int ublk_thread_init(struct ublk_thread *t) +static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags) { struct ublk_dev *dev = t->dev; + unsigned long long flags = dev->dev_info.flags | extra_flags; int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; int ret; @@ -512,7 +516,17 @@ static int ublk_thread_init(struct ublk_thread *t) io_uring_register_ring_fd(&t->ring); - ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds); + if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { + /* Register only backing files starting from index 1, exclude ublk control device */ + if (dev->nr_fds > 1) { + ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1); + } else { + /* No backing files to register, skip file registration */ + ret = 0; + } + } else { + ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds); + } if (ret) { ublk_err("ublk dev %d thread %d register files failed %d\n", t->dev->dev_info.dev_id, t->idx, ret); @@ -626,9 +640,12 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) /* These fields should be written once, never change */ ublk_set_sqe_cmd_op(sqe[0], cmd_op); - sqe[0]->fd = 0; /* dev->fds[0] */ + 
sqe[0]->fd = ublk_get_registered_fd(q, 0); /* dev->fds[0] */ sqe[0]->opcode = IORING_OP_URING_CMD; - sqe[0]->flags = IOSQE_FIXED_FILE; + if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD) + sqe[0]->flags = 0; /* Use raw FD, not fixed file */ + else + sqe[0]->flags = IOSQE_FIXED_FILE; sqe[0]->rw_flags = 0; cmd->tag = io->tag; cmd->q_id = q->q_id; @@ -832,6 +849,7 @@ struct ublk_thread_info { unsigned idx; sem_t *ready; cpu_set_t *affinity; + unsigned long long extra_flags; }; static void *ublk_io_handler_fn(void *data) @@ -844,7 +862,7 @@ static void *ublk_io_handler_fn(void *data) t->dev = info->dev; t->idx = info->idx; - ret = ublk_thread_init(t); + ret = ublk_thread_init(t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", dev_id, t->idx); @@ -934,6 +952,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) if (ctx->auto_zc_fallback) extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; + if (ctx->no_ublk_fixed_fd) + extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD; for (i = 0; i < dinfo->nr_hw_queues; i++) { dev->q[i].dev = dev; @@ -951,6 +971,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) tinfo[i].dev = dev; tinfo[i].idx = i; tinfo[i].ready = &ready; + tinfo[i].extra_flags = extra_flags; /* * If threads are not tied 1:1 to queues, setting thread @@ -1400,7 +1421,7 @@ static int cmd_dev_get_features(void) if (!((1ULL << i) & features)) continue; - if (i < sizeof(feat_map) / sizeof(feat_map[0])) + if (i < ARRAY_SIZE(feat_map)) feat = feat_map[i]; else feat = "unknown"; @@ -1471,13 +1492,13 @@ static void __cmd_create_help(char *exe, bool recovery) printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n", exe, recovery ? 
"recover" : "add"); printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n"); - printf("\t[-e 0|1 ] [-i 0|1]\n"); + printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n"); printf("\t[--nthreads threads] [--per_io_tasks]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); - for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) { + for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) { const struct ublk_tgt_ops *ops = tgt_ops_list[i]; if (ops->usage) @@ -1534,6 +1555,7 @@ int main(int argc, char *argv[]) { "size", 1, NULL, 's'}, { "nthreads", 1, NULL, 0 }, { "per_io_tasks", 0, NULL, 0 }, + { "no_ublk_fixed_fd", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1613,6 +1635,8 @@ int main(int argc, char *argv[]) ctx.nthreads = strtol(optarg, NULL, 10); if (!strcmp(longopts[option_idx].name, "per_io_tasks")) ctx.per_io_tasks = 1; + if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd")) + ctx.no_ublk_fixed_fd = 1; break; case '?': /* diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 219233f8a053..5e55484fb0aa 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -77,6 +77,7 @@ struct dev_ctx { unsigned int recovery:1; unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; + unsigned int no_ublk_fixed_fd:1; int _evtfd; int _shmid; @@ -166,7 +167,9 @@ struct ublk_queue { /* borrow one bit of ublk uapi flags, which may never be used */ #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) +#define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) __u64 flags; + int ublk_fd; /* cached ublk char device fd */ struct ublk_io ios[UBLK_QUEUE_DEPTH]; }; @@ -273,34 +276,48 @@ static inline int ublk_io_alloc_sqes(struct ublk_thread *t, return nr_sqes; } -static inline 
void io_uring_prep_buf_register(struct io_uring_sqe *sqe, - int dev_fd, int tag, int q_id, __u64 index) +static inline int ublk_get_registered_fd(struct ublk_queue *q, int fd_index) +{ + if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD) { + if (fd_index == 0) + /* Return the raw ublk FD for index 0 */ + return q->ublk_fd; + /* Adjust index for backing files (index 1 becomes 0, etc.) */ + return fd_index - 1; + } + return fd_index; +} + +static inline void __io_uring_prep_buf_reg_unreg(struct io_uring_sqe *sqe, + struct ublk_queue *q, int tag, int q_id, __u64 index) { struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + int dev_fd = ublk_get_registered_fd(q, 0); io_uring_prep_read(sqe, dev_fd, 0, 0, 0); sqe->opcode = IORING_OP_URING_CMD; - sqe->flags |= IOSQE_FIXED_FILE; - sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; + if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD) + sqe->flags &= ~IOSQE_FIXED_FILE; + else + sqe->flags |= IOSQE_FIXED_FILE; cmd->tag = tag; cmd->addr = index; cmd->q_id = q_id; } -static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, - int dev_fd, int tag, int q_id, __u64 index) +static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, + struct ublk_queue *q, int tag, int q_id, __u64 index) { - struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + __io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index); + sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; +} - io_uring_prep_read(sqe, dev_fd, 0, 0, 0); - sqe->opcode = IORING_OP_URING_CMD; - sqe->flags |= IOSQE_FIXED_FILE; +static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, + struct ublk_queue *q, int tag, int q_id, __u64 index) +{ + __io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index); sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; - - cmd->tag = tag; - cmd->addr = index; - cmd->q_id = q_id; } static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c 
index f0e0003a4860..280043f6b689 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -63,7 +63,7 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; @@ -71,7 +71,7 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, __setup_nop_io(tag, iod, sqe[1], q->q_id); sqe[1]->flags |= IOSQE_IO_HARDLINK; - io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); // buf register is marked as IOSQE_CQE_SKIP_SUCCESS diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 1fb9b7cc281b..791fa8dc1651 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -142,7 +142,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { - io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -168,7 +168,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (zc) { struct io_uring_sqe *unreg = sqe[s->nr + 1]; - io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, 
io->buf_index); unreg->user_data = build_user_data( tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); } diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh index 40d1437ca298..3f901db4d09d 100755 --- a/tools/testing/selftests/ublk/test_stress_04.sh +++ b/tools/testing/selftests/ublk/test_stress_04.sh @@ -28,14 +28,14 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M -ublk_io_and_kill_daemon 8G -t null -q 4 -z & -ublk_io_and_kill_daemon 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" & +ublk_io_and_kill_daemon 8G -t null -q 4 -z --no_ublk_fixed_fd & +ublk_io_and_kill_daemon 256M -t loop -q 4 -z --no_ublk_fixed_fd "${UBLK_BACKFILES[0]}" & ublk_io_and_kill_daemon 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & if _have_feature "AUTO_BUF_REG"; then ublk_io_and_kill_daemon 8G -t null -q 4 --auto_zc & ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" & - ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & + ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc --no_ublk_fixed_fd "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback & fi diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 30d5c8f0e5c7..ba322a353aff 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only vdso_test vdso_test_abi -vdso_test_clock_getres vdso_test_correctness vdso_test_gettimeofday vdso_test_getcpu diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 918a2caa070e..e361aca22a74 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -4,7 +4,6 @@ include ../../../scripts/Makefile.arch TEST_GEN_PROGS := vdso_test_gettimeofday 
TEST_GEN_PROGS += vdso_test_getcpu TEST_GEN_PROGS += vdso_test_abi -TEST_GEN_PROGS += vdso_test_clock_getres ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) TEST_GEN_PROGS += vdso_standalone_test_x86 endif @@ -29,7 +28,6 @@ CFLAGS_NOLIBC := -nostdlib -nostdinc -ffreestanding -fno-asynchronous-unwind-tab $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c -$(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c | headers $(OUTPUT)/vdso_standalone_test_x86: CFLAGS:=$(CFLAGS_NOLIBC) $(CFLAGS) diff --git a/tools/testing/selftests/vDSO/vdso_call.h b/tools/testing/selftests/vDSO/vdso_call.h index bb237d771051..e7205584cbdc 100644 --- a/tools/testing/selftests/vDSO/vdso_call.h +++ b/tools/testing/selftests/vDSO/vdso_call.h @@ -44,7 +44,6 @@ register long _r6 asm ("r6"); \ register long _r7 asm ("r7"); \ register long _r8 asm ("r8"); \ - register long _rval asm ("r3"); \ \ LOADARGS_##nr(fn, args); \ \ @@ -54,13 +53,13 @@ " bns+ 1f\n" \ " neg 3, 3\n" \ "1:" \ - : "+r" (_r0), "=r" (_r3), "+r" (_r4), "+r" (_r5), \ + : "+r" (_r0), "+r" (_r3), "+r" (_r4), "+r" (_r5), \ "+r" (_r6), "+r" (_r7), "+r" (_r8) \ - : "r" (_rval) \ + : \ : "r9", "r10", "r11", "r12", "cr0", "cr1", "cr5", \ "cr6", "cr7", "xer", "lr", "ctr", "memory" \ ); \ - _rval; \ + _r3; \ }) #else diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index a54424e2336f..238d609a457a 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -26,24 +26,31 @@ static const char *version; static const char **name; +/* The same as struct __kernel_timespec */ +struct vdso_timespec64 { + uint64_t tv_sec; + uint64_t tv_nsec; +}; + typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz); typedef 
long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); +typedef long (*vdso_clock_gettime64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); typedef time_t (*vdso_time_t)(time_t *t); -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", +static const char * const vdso_clock_name[] = { + [CLOCK_REALTIME] = "CLOCK_REALTIME", + [CLOCK_MONOTONIC] = "CLOCK_MONOTONIC", + [CLOCK_PROCESS_CPUTIME_ID] = "CLOCK_PROCESS_CPUTIME_ID", + [CLOCK_THREAD_CPUTIME_ID] = "CLOCK_THREAD_CPUTIME_ID", + [CLOCK_MONOTONIC_RAW] = "CLOCK_MONOTONIC_RAW", + [CLOCK_REALTIME_COARSE] = "CLOCK_REALTIME_COARSE", + [CLOCK_MONOTONIC_COARSE] = "CLOCK_MONOTONIC_COARSE", + [CLOCK_BOOTTIME] = "CLOCK_BOOTTIME", + [CLOCK_REALTIME_ALARM] = "CLOCK_REALTIME_ALARM", + [CLOCK_BOOTTIME_ALARM] = "CLOCK_BOOTTIME_ALARM", + [10 /* CLOCK_SGI_CYCLE */] = "CLOCK_SGI_CYCLE", + [CLOCK_TAI] = "CLOCK_TAI", }; static void vdso_test_gettimeofday(void) @@ -70,6 +77,33 @@ static void vdso_test_gettimeofday(void) } } +static void vdso_test_clock_gettime64(clockid_t clk_id) +{ + /* Find clock_gettime64. 
*/ + vdso_clock_gettime64_t vdso_clock_gettime64 = + (vdso_clock_gettime64_t)vdso_sym(version, name[5]); + + if (!vdso_clock_gettime64) { + ksft_print_msg("Couldn't find %s\n", name[5]); + ksft_test_result_skip("%s %s\n", name[5], + vdso_clock_name[clk_id]); + return; + } + + struct vdso_timespec64 ts; + long ret = VDSO_CALL(vdso_clock_gettime64, 2, clk_id, &ts); + + if (ret == 0) { + ksft_print_msg("The time is %lld.%06lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_test_result_pass("%s %s\n", name[5], + vdso_clock_name[clk_id]); + } else { + ksft_test_result_fail("%s %s\n", name[5], + vdso_clock_name[clk_id]); + } +} + static void vdso_test_clock_gettime(clockid_t clk_id) { /* Find clock_gettime. */ @@ -171,23 +205,23 @@ static inline void vdso_test_clock(clockid_t clock_id) ksft_print_msg("clock_id: %s\n", vdso_clock_name[clock_id]); vdso_test_clock_gettime(clock_id); + vdso_test_clock_gettime64(clock_id); vdso_test_clock_getres(clock_id); } -#define VDSO_TEST_PLAN 16 +#define VDSO_TEST_PLAN 29 int main(int argc, char **argv) { unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); ksft_print_header(); - ksft_set_plan(VDSO_TEST_PLAN); - if (!sysinfo_ehdr) { - ksft_print_msg("AT_SYSINFO_EHDR is not present!\n"); - return KSFT_SKIP; - } + if (!sysinfo_ehdr) + ksft_exit_skip("AT_SYSINFO_EHDR is not present!\n"); + + ksft_set_plan(VDSO_TEST_PLAN); version = versions[VDSO_VERSION]; name = (const char **)&names[VDSO_NAMES]; @@ -198,40 +232,17 @@ int main(int argc, char **argv) vdso_test_gettimeofday(); -#if _POSIX_TIMERS > 0 - -#ifdef CLOCK_REALTIME vdso_test_clock(CLOCK_REALTIME); -#endif - -#ifdef CLOCK_BOOTTIME vdso_test_clock(CLOCK_BOOTTIME); -#endif - -#ifdef CLOCK_TAI vdso_test_clock(CLOCK_TAI); -#endif - -#ifdef CLOCK_REALTIME_COARSE vdso_test_clock(CLOCK_REALTIME_COARSE); -#endif - -#ifdef CLOCK_MONOTONIC vdso_test_clock(CLOCK_MONOTONIC); -#endif - -#ifdef CLOCK_MONOTONIC_RAW vdso_test_clock(CLOCK_MONOTONIC_RAW); -#endif - -#ifdef 
CLOCK_MONOTONIC_COARSE vdso_test_clock(CLOCK_MONOTONIC_COARSE); -#endif - -#endif + vdso_test_clock(CLOCK_PROCESS_CPUTIME_ID); + vdso_test_clock(CLOCK_THREAD_CPUTIME_ID); vdso_test_time(); - ksft_print_cnts(); - return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; + ksft_finished(); } diff --git a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c b/tools/testing/selftests/vDSO/vdso_test_clock_getres.c deleted file mode 100644 index b5d5f59f725a..000000000000 --- a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note -/* - * vdso_clock_getres.c: Sample code to test clock_getres. - * Copyright (c) 2019 Arm Ltd. - * - * Compile with: - * gcc -std=gnu99 vdso_clock_getres.c - * - * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit), - * Power (32-bit and 64-bit), S390x (32-bit and 64-bit). - * Might work on other architectures. - */ - -#define _GNU_SOURCE -#include <elf.h> -#include <fcntl.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <time.h> -#include <sys/auxv.h> -#include <sys/mman.h> -#include <sys/time.h> -#include <unistd.h> -#include <sys/syscall.h> - -#include "../kselftest.h" - -static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts) -{ - long ret; - - ret = syscall(SYS_clock_getres, _clkid, _ts); - - return ret; -} - -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", -}; - -/* - * This function calls clock_getres in vdso and by system call - * with different values for clock_id. 
- * - * Example of output: - * - * clock_id: CLOCK_REALTIME [PASS] - * clock_id: CLOCK_BOOTTIME [PASS] - * clock_id: CLOCK_TAI [PASS] - * clock_id: CLOCK_REALTIME_COARSE [PASS] - * clock_id: CLOCK_MONOTONIC [PASS] - * clock_id: CLOCK_MONOTONIC_RAW [PASS] - * clock_id: CLOCK_MONOTONIC_COARSE [PASS] - */ -static inline int vdso_test_clock(unsigned int clock_id) -{ - struct timespec x, y; - - printf("clock_id: %s", vdso_clock_name[clock_id]); - clock_getres(clock_id, &x); - syscall_clock_getres(clock_id, &y); - - if ((x.tv_sec != y.tv_sec) || (x.tv_nsec != y.tv_nsec)) { - printf(" [FAIL]\n"); - return KSFT_FAIL; - } - - printf(" [PASS]\n"); - return KSFT_PASS; -} - -int main(int argc, char **argv) -{ - int ret = 0; - -#if _POSIX_TIMERS > 0 - -#ifdef CLOCK_REALTIME - ret += vdso_test_clock(CLOCK_REALTIME); -#endif - -#ifdef CLOCK_BOOTTIME - ret += vdso_test_clock(CLOCK_BOOTTIME); -#endif - -#ifdef CLOCK_TAI - ret += vdso_test_clock(CLOCK_TAI); -#endif - -#ifdef CLOCK_REALTIME_COARSE - ret += vdso_test_clock(CLOCK_REALTIME_COARSE); -#endif - -#ifdef CLOCK_MONOTONIC - ret += vdso_test_clock(CLOCK_MONOTONIC); -#endif - -#ifdef CLOCK_MONOTONIC_RAW - ret += vdso_test_clock(CLOCK_MONOTONIC_RAW); -#endif - -#ifdef CLOCK_MONOTONIC_COARSE - ret += vdso_test_clock(CLOCK_MONOTONIC_COARSE); -#endif - -#endif - if (ret > 0) - return KSFT_FAIL; - - return KSFT_PASS; -} diff --git a/tools/testing/shared/linux/idr.h b/tools/testing/shared/linux/idr.h index 4e342f2e37cf..676c5564e33f 100644 --- a/tools/testing/shared/linux/idr.h +++ b/tools/testing/shared/linux/idr.h @@ -1 +1,5 @@ +/* Avoid duplicate definitions due to system headers. 
*/ +#ifdef __CONCAT +#undef __CONCAT +#endif #include "../../../../include/linux/idr.h" diff --git a/tools/tracing/latency/Makefile.config b/tools/tracing/latency/Makefile.config index 0fe6b50f029b..6efa13e3ca93 100644 --- a/tools/tracing/latency/Makefile.config +++ b/tools/tracing/latency/Makefile.config @@ -1,7 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-only +include $(srctree)/tools/scripts/utilities.mak + STOP_ERROR := +ifndef ($(NO_LIBTRACEEVENT),1) + ifeq ($(call get-executable,$(PKG_CONFIG)),) + $(error Error: $(PKG_CONFIG) needed by libtraceevent/libtracefs is missing on this system, please install it) + endif +endif + define lib_setup $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)")) diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config index 5f2231d8d626..07ff5e8f3006 100644 --- a/tools/tracing/rtla/Makefile.config +++ b/tools/tracing/rtla/Makefile.config @@ -1,10 +1,18 @@ # SPDX-License-Identifier: GPL-2.0-only +include $(srctree)/tools/scripts/utilities.mak + STOP_ERROR := LIBTRACEEVENT_MIN_VERSION = 1.5 LIBTRACEFS_MIN_VERSION = 1.6 +ifndef ($(NO_LIBTRACEEVENT),1) + ifeq ($(call get-executable,$(PKG_CONFIG)),) + $(error Error: $(PKG_CONFIG) needed by libtraceevent/libtracefs is missing on this system, please install it) + endif +endif + define lib_setup $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)")) diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index aaf0808125d7..13ff1934d47c 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -49,7 +49,7 @@ actions_destroy(struct actions *self) static struct action * actions_new(struct actions *self) { - if (self->size >= self->len) { + if (self->len >= self->size) { self->size *= 2; self->list = realloc(self->list, self->size * 
sizeof(struct action)); } @@ -131,7 +131,7 @@ actions_parse(struct actions *self, const char *trigger) { enum action_type type = ACTION_NONE; char *token; - char trigger_c[strlen(trigger)]; + char trigger_c[strlen(trigger) + 1]; /* For ACTION_SIGNAL */ int signal = 0, pid = 0; |