Diffstat (limited to 'tools')
210 files changed, 8583 insertions, 660 deletions
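Much of the x86 churn below teaches the instruction decoder shared by objtool and perf about AMD's three-byte XOP prefix (0x8f): a new INAT_PFX_XOP prefix class, XOP opcode maps 8h/9h/Ah in x86-opcode-map.txt, and helpers such as insn_is_xop() and inat_get_xop_attribute(). A minimal sketch of how a consumer of the tools/ copy of the decoder could use the new helper follows; the starts_with_xop() wrapper is hypothetical and assumes a build set up like objtool's or perf's, and only insn_init(), insn_get_length(), insn_complete() and insn_is_xop() come from the decoder itself.

	#include <stdbool.h>
	#include "insn.h"	/* tools/arch/x86/include/asm/insn.h */

	/* Hypothetical helper, not part of the patch: report whether the
	 * first instruction in buf uses the AMD XOP (0x8f) encoding. */
	static bool starts_with_xop(const unsigned char *buf, int len)
	{
		struct insn insn;

		insn_init(&insn, buf, len, /*x86_64=*/1);
		if (insn_get_length(&insn) || !insn_complete(&insn))
			return false;	/* buffer did not decode cleanly */

		return insn_is_xop(&insn);
	}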
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 9a5d85cfd1fb..139d5e87dc95 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -75,11 +75,13 @@ #define ARM_CPU_PART_CORTEX_A76 0xD0B #define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define ARM_CPU_PART_CORTEX_A77 0xD0D +#define ARM_CPU_PART_CORTEX_A76AE 0xD0E #define ARM_CPU_PART_NEOVERSE_V1 0xD40 #define ARM_CPU_PART_CORTEX_A78 0xD41 #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 +#define ARM_CPU_PART_CORTEX_X1C 0xD4C #define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_A715 0xD4D @@ -119,9 +121,11 @@ #define QCOM_CPU_PART_KRYO 0x200 #define QCOM_CPU_PART_KRYO_2XX_GOLD 0x800 #define QCOM_CPU_PART_KRYO_2XX_SILVER 0x801 +#define QCOM_CPU_PART_KRYO_3XX_GOLD 0x802 #define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803 #define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804 #define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805 +#define QCOM_CPU_PART_ORYON_X1 0x001 #define NVIDIA_CPU_PART_DENVER 0x003 #define NVIDIA_CPU_PART_CARMEL 0x004 @@ -129,6 +133,7 @@ #define FUJITSU_CPU_PART_A64FX 0x001 #define HISI_CPU_PART_TSV110 0xD01 +#define HISI_CPU_PART_HIP09 0xD02 #define HISI_CPU_PART_HIP12 0xD06 #define APPLE_CPU_PART_M1_ICESTORM 0x022 @@ -159,11 +164,13 @@ #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) +#define MIDR_CORTEX_A76AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76AE) #define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) #define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) +#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) #define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715) @@ -196,13 +203,26 @@ #define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO) #define MIDR_QCOM_KRYO_2XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_GOLD) #define MIDR_QCOM_KRYO_2XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_SILVER) +#define MIDR_QCOM_KRYO_3XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_GOLD) #define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER) #define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD) #define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER) +#define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1) + +/* + * NOTES: + * - Qualcomm Kryo 5XX Prime / Gold ID themselves as MIDR_CORTEX_A77 + * - Qualcomm Kryo 5XX Silver IDs itself as MIDR_QCOM_KRYO_4XX_SILVER + * - Qualcomm Kryo 6XX Prime IDs itself as MIDR_CORTEX_X1 + * - Qualcomm Kryo 6XX Gold IDs itself as ARM_CPU_PART_CORTEX_A78 + * - Qualcomm Kryo 6XX Silver IDs 
itself as MIDR_CORTEX_A55 + */ + #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) +#define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09) #define MIDR_HISI_HIP12 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP12) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) @@ -291,6 +311,14 @@ static inline u32 __attribute_const__ read_cpuid_id(void) return read_cpuid(MIDR_EL1); } +struct target_impl_cpu { + u64 midr; + u64 revidr; + u64 aidr; +}; + +bool cpu_errata_set_target_impl(u64 num, void *impl_cpus); + static inline u64 __attribute_const__ read_cpuid_mpidr(void) { return read_cpuid(MPIDR_EL1); diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 690b6ebd118f..65f2759ea27a 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -1080,9 +1080,6 @@ #define ARM64_FEATURE_FIELD_BITS 4 -/* Defined for compatibility only, do not add new users. */ -#define ARM64_FEATURE_MASK(x) (x##_MASK) - #ifdef __ASSEMBLY__ .macro mrs_s, rt, sreg diff --git a/tools/arch/loongarch/include/asm/inst.h b/tools/arch/loongarch/include/asm/inst.h index c25b5853181d..d68fad63c8b7 100644 --- a/tools/arch/loongarch/include/asm/inst.h +++ b/tools/arch/loongarch/include/asm/inst.h @@ -51,6 +51,10 @@ enum reg2i16_op { bgeu_op = 0x1b, }; +enum reg3_op { + amswapw_op = 0x70c0, +}; + struct reg0i15_format { unsigned int immediate : 15; unsigned int opcode : 17; @@ -96,6 +100,13 @@ struct reg2i16_format { unsigned int opcode : 6; }; +struct reg3_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int rk : 5; + unsigned int opcode : 17; +}; + union loongarch_instruction { unsigned int word; struct reg0i15_format reg0i15_format; @@ -105,6 +116,7 @@ union loongarch_instruction { struct reg2i12_format reg2i12_format; struct reg2i14_format reg2i14_format; struct reg2i16_format reg2i16_format; + struct reg3_format reg3_format; }; #define LOONGARCH_INSN_SIZE sizeof(union loongarch_instruction) diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index eaeda001784e..077c5437f521 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -1,18 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * * Copyright IBM Corp. 
2007 * * Authors: Hollis Blanchard <hollisb@us.ibm.com> diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h index 0dfc09254f99..56d7367ee344 100644 --- a/tools/arch/riscv/include/asm/csr.h +++ b/tools/arch/riscv/include/asm/csr.h @@ -468,13 +468,13 @@ #define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) #define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_STR(x) x #else #define __ASM_STR(x) #x #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define csr_swap(csr, val) \ ({ \ @@ -536,6 +536,6 @@ : "memory"); \ }) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CSR_H */ diff --git a/tools/arch/riscv/include/asm/vdso/processor.h b/tools/arch/riscv/include/asm/vdso/processor.h index 662aca039848..0665b117f30f 100644 --- a/tools/arch/riscv/include/asm/vdso/processor.h +++ b/tools/arch/riscv/include/asm/vdso/processor.h @@ -2,7 +2,7 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm-generic/barrier.h> @@ -27,6 +27,6 @@ static inline void cpu_relax(void) barrier(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index ee176236c2be..06fc0479a23f 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -218,6 +218,7 @@ #define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */ #define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */ #define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */ +#define X86_FEATURE_COHERENCY_SFW_NO ( 8*32+ 4) /* SNP cache coherency software work around not needed */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */ @@ -456,10 +457,14 @@ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ #define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ +#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ + #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ +#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */ + #define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ @@ -487,6 +492,9 @@ #define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ #define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ #define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */ +#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ +#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ +#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ /* * BUG word(s) @@ -542,5 +550,5 @@ #define 
X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */ #define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ #define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ - +#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index 183aa662b165..099e926595bd 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 /* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr & INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 0e5abd896ad4..c683d609934b 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 
0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if (!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 5cfb5d74dd5f..f627196eb796 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -419,6 +421,7 @@ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) +#define DEBUGCTLMSR_RTM_DEBUG BIT(15) #define MSR_PEBS_FRONTEND 0x000003f7 @@ -733,6 +736,11 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Hardware Feedback Support MSRs */ +#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500 +#define MSR_AMD_WORKLOAD_CLASS_ID 0xc0000501 +#define MSR_AMD_WORKLOAD_HRST 0xc0000502 + /* AMD Last Branch Record MSRs */ #define MSR_AMD64_LBR_SELECT 0xc000010e @@ -831,6 +839,7 @@ #define 
MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT) #define MSR_K7_HWCR_IRPERF_EN_BIT 30 #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) +#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35 #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 #define MSR_K7_HWCR_CPB_DIS_BIT 25 diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 6f3499507c5e..0f15d683817d 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -965,7 +965,13 @@ struct kvm_tdx_cmd { struct kvm_tdx_capabilities { __u64 supported_attrs; __u64 supported_xfam; - __u64 reserved[254]; + + __u64 kernel_tdvmcallinfo_1_r11; + __u64 user_tdvmcallinfo_1_r11; + __u64 kernel_tdvmcallinfo_1_r12; + __u64 user_tdvmcallinfo_1_r12; + + __u64 reserved[250]; /* Configurable CPUID bits for userspace */ struct kvm_cpuid2 cpuid; diff --git a/tools/arch/x86/lib/inat.c b/tools/arch/x86/lib/inat.c index dfbcc6405941..ffcb0e27453b 100644 --- a/tools/arch/x86/lib/inat.c +++ b/tools/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index bce69c6bfa69..1d1c57c74d1f 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ found: } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is @@ -226,13 +229,13 @@ found: if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int 
insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb95..2a4e69ecc2de 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. +# (xop): this opcode accepts XOP prefix. +# +# XOP Superscripts +# (W=0): this opcode requires XOP.W == 0 +# (W=1): this opcode requires XOP.W == 1 # # Last Prefix Superscripts # - (66): the last prefix is 0x66 @@ -194,7 +199,7 @@ AVXcode: 8c: MOV Ev,Sw 8d: LEA Gv,M 8e: MOV Sw,Ew -8f: Grp1A (1A) | POP Ev (d64) +8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix) # 0x90 - 0x9f 90: NOP | PAUSE (F3) | XCHG r8,rAX 91: XCHG rCX/r9,rAX @@ -1106,6 +1111,84 @@ AVXcode: 7 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) EndTable +# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5 +Table: XOP map 8h +Referrer: +XOPcode: 0 +85: VPMACSSWW Vo,Ho,Wo,Lo +86: VPMACSSWD Vo,Ho,Wo,Lo +87: VPMACSSDQL Vo,Ho,Wo,Lo +8e: VPMACSSDD Vo,Ho,Wo,Lo +8f: VPMACSSDQH Vo,Ho,Wo,Lo +95: VPMACSWW Vo,Ho,Wo,Lo +96: VPMACSWD Vo,Ho,Wo,Lo +97: VPMACSDQL Vo,Ho,Wo,Lo +9e: VPMACSDD Vo,Ho,Wo,Lo +9f: VPMACSDQH Vo,Ho,Wo,Lo +a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1) +a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1) +a6: VPMADCSSWD Vo,Ho,Wo,Lo +b6: VPMADCSWD Vo,Ho,Wo,Lo +c0: VPROTB Vo,Wo,Ib +c1: VPROTW Vo,Wo,Ib +c2: VPROTD Vo,Wo,Ib +c3: VPROTQ Vo,Wo,Ib +cc: VPCOMccB Vo,Ho,Wo,Ib +cd: VPCOMccW Vo,Ho,Wo,Ib +ce: VPCOMccD Vo,Ho,Wo,Ib +cf: VPCOMccQ Vo,Ho,Wo,Ib +ec: VPCOMccUB Vo,Ho,Wo,Ib +ed: VPCOMccUW Vo,Ho,Wo,Ib +ee: VPCOMccUD Vo,Ho,Wo,Ib +ef: VPCOMccUQ Vo,Ho,Wo,Ib +EndTable + +Table: XOP map 9h +Referrer: +XOPcode: 1 +01: GrpXOP1 +02: GrpXOP2 +12: GrpXOP3 +80: VFRCZPS Vx,Wx +81: VFRCZPD Vx,Wx +82: VFRCZSS Vq,Wss +83: VFRCZSD Vq,Wsd +90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +91: VPROTW Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +92: VPROTD Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +93: VPROTQ Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1) +95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1) +96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1) +97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1) +98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1) +99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1) +9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1) +9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1) +c1: VPHADDBW Vo,Wo +c2: VPHADDBD Vo,Wo +c3: VPHADDBQ Vo,Wo +c6: VPHADDWD Vo,Wo +c7: VPHADDWQ Vo,Wo +cb: VPHADDDQ Vo,Wo +d1: VPHADDUBWD Vo,Wo +d2: VPHADDUBD Vo,Wo +d3: VPHADDUBQ Vo,Wo +d6: VPHADDUWD Vo,Wo +d7: VPHADDUWQ Vo,Wo +db: VPHADDUDQ Vo,Wo +e1: VPHSUBBW Vo,Wo +e2: VPHSUBWD Vo,Wo +e3: VPHSUBDQ Vo,Wo +EndTable + +Table: XOP map Ah +Referrer: +XOPcode: 2 +10: BEXTR Gy,Ey,Id +12: GrpXOP4 +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1320,3 +1403,29 @@ GrpTable: GrpRNG 4: xcrypt-cfb 5: xcrypt-ofb EndTable + +# GrpXOP1-4 is shown in AMD APM Vol.3 Appendix A as XOP group #1-4 +GrpTable: GrpXOP1 +1: BLCFILL By,Ey (xop) +2: BLSFILL By,Ey (xop) +3: BLCS By,Ey 
(xop) +4: TZMSK By,Ey (xop) +5: BLCIC By,Ey (xop) +6: BLSIC By,Ey (xop) +7: T1MSKC By,Ey (xop) +EndTable + +GrpTable: GrpXOP2 +1: BLCMSK By,Ey (xop) +6: BLCI By,Ey (xop) +EndTable + +GrpTable: GrpXOP3 +0: LLWPCB Ry (xop) +1: SLWPCB Ry (xop) +EndTable + +GrpTable: GrpXOP4 +0: LWPINS By,Ed,Id (xop) +1: LWPVAL By,Ed,Id (xop) +EndTable diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index 2c19d7fc8a85..7ea1b75e59b7 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -21,6 +21,7 @@ function clear_vars() { eid = -1 # escape id gid = -1 # group id aid = -1 # AVX id + xopid = -1 # XOP id tname = "" } @@ -39,9 +40,11 @@ BEGIN { ggid = 1 geid = 1 gaid = 0 + gxopid = 0 delete etable delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void 
inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 57c669d2aa90..55d59ed507d5 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -193,7 +193,7 @@ static int load_xbc_from_initrd(int fd, char **buf) if (stat.st_size < BOOTCONFIG_FOOTER_SIZE) return 0; - if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) + if (lseek(fd, -(off_t)BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) return pr_errno("Failed to lseek for magic", -errno); if (read(fd, magic, BOOTCONFIG_MAGIC_LEN) < 0) @@ -203,7 +203,7 @@ static int load_xbc_from_initrd(int fd, char **buf) if (memcmp(magic, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN) != 0) return 0; - if (lseek(fd, -BOOTCONFIG_FOOTER_SIZE, SEEK_END) < 0) + if (lseek(fd, -(off_t)BOOTCONFIG_FOOTER_SIZE, SEEK_END) < 0) return pr_errno("Failed to lseek for size", -errno); if (read(fd, &size, sizeof(uint32_t)) < 0) diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile index ed565eb52275..342e056c8c66 100644 --- a/tools/gpio/Makefile +++ b/tools/gpio/Makefile @@ -77,7 +77,7 @@ $(OUTPUT)gpio-watch: $(GPIO_WATCH_IN) clean: rm -f $(ALL_PROGRAMS) - rm -f $(OUTPUT)include/linux/gpio.h + rm -rf $(OUTPUT)include find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) diff --git a/tools/include/linux/args.h b/tools/include/linux/args.h new file mode 100644 index 000000000000..2e8e65d975c7 --- /dev/null +++ b/tools/include/linux/args.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_ARGS_H +#define _LINUX_ARGS_H + +/* + * How do these macros work? + * + * In __COUNT_ARGS() _0 to _12 are just placeholders from the start + * in order to make sure _n is positioned over the correct number + * from 12 to 0 (depending on X, which is a variadic argument list). + * They serve no purpose other than occupying a position. Since each + * macro parameter must have a distinct identifier, those identifiers + * are as good as any. + * + * In COUNT_ARGS() we use actual integers, so __COUNT_ARGS() returns + * that as _n. + */ + +/* This counts to 15. Any more, it will return 16th argument. */ +#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _n, X...) _n +#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + +/* Concatenate two parameters, but allow them to be expanded beforehand. 
*/ +#define __CONCAT(a, b) a ## b +#define CONCATENATE(a, b) __CONCAT(a, b) + +#endif /* _LINUX_ARGS_H */ diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 7ad056219115..a40cc861b3a7 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -2,10 +2,8 @@ #ifndef __LINUX_BITS_H #define __LINUX_BITS_H -#include <linux/const.h> #include <vdso/bits.h> #include <uapi/linux/bits.h> -#include <asm/bitsperlong.h> #define BIT_MASK(nr) (UL(1) << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) @@ -50,10 +48,14 @@ (type_max(t) << (l) & \ type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h))))) +#define GENMASK(h, l) GENMASK_TYPE(unsigned long, h, l) +#define GENMASK_ULL(h, l) GENMASK_TYPE(unsigned long long, h, l) + #define GENMASK_U8(h, l) GENMASK_TYPE(u8, h, l) #define GENMASK_U16(h, l) GENMASK_TYPE(u16, h, l) #define GENMASK_U32(h, l) GENMASK_TYPE(u32, h, l) #define GENMASK_U64(h, l) GENMASK_TYPE(u64, h, l) +#define GENMASK_U128(h, l) GENMASK_TYPE(u128, h, l) /* * Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The @@ -79,28 +81,9 @@ * BUILD_BUG_ON_ZERO is not available in h files included from asm files, * disable the input check if that is the case. */ -#define GENMASK_INPUT_CHECK(h, l) 0 +#define GENMASK(h, l) __GENMASK(h, l) +#define GENMASK_ULL(h, l) __GENMASK_ULL(h, l) #endif /* !defined(__ASSEMBLY__) */ -#define GENMASK(h, l) \ - (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) -#define GENMASK_ULL(h, l) \ - (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) - -#if !defined(__ASSEMBLY__) -/* - * Missing asm support - * - * __GENMASK_U128() depends on _BIT128() which would not work - * in the asm code, as it shifts an 'unsigned __int128' data - * type instead of direct representation of 128 bit constants - * such as long and unsigned long. The fundamental problem is - * that a 128 bit constant will get silently truncated by the - * gcc compiler. - */ -#define GENMASK_U128(h, l) \ - (GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l)) -#endif - #endif /* __LINUX_BITS_H */ diff --git a/tools/include/linux/cfi_types.h b/tools/include/linux/cfi_types.h index 6b8713675765..fb8d90bff92e 100644 --- a/tools/include/linux/cfi_types.h +++ b/tools/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include <linux/linkage.h> -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_<function> type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,17 +29,40 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) #endif +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_CFI_CLANG +#define DEFINE_CFI_TYPE(name, func) \ + /* \ + * Force a reference to the function so the compiler generates \ + * __kcfi_typeid_<func>. 
\ + */ \ + __ADDRESSABLE(func); \ + /* u32 name __ro_after_init = __kcfi_typeid_<func> */ \ + extern u32 name; \ + asm ( \ + " .pushsection .data..ro_after_init,\"aw\",\%progbits \n" \ + " .type " #name ",\%object \n" \ + " .globl " #name " \n" \ + " .p2align 2, 0x0 \n" \ + #name ": \n" \ + " .4byte __kcfi_typeid_" #func " \n" \ + " .size " #name ", 4 \n" \ + " .popsection \n" \ + ); +#endif + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_CFI_TYPES_H */ diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 2892a45023af..04e0077fb4c9 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -852,8 +852,14 @@ __SYSCALL(__NR_removexattrat, sys_removexattrat) #define __NR_open_tree_attr 467 __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) +/* fs/inode.c */ +#define __NR_file_getattr 468 +__SYSCALL(__NR_file_getattr, sys_file_getattr) +#define __NR_file_setattr 469 +__SYSCALL(__NR_file_setattr, sys_file_setattr) + #undef __NR_syscalls -#define __NR_syscalls 468 +#define __NR_syscalls 470 /* * 32 bit systems traditionally used different diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 7415a3863891..f0f0d49d2544 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -178,6 +178,7 @@ struct kvm_xen_exit { #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 +#define KVM_EXIT_TDX 40 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -447,6 +448,31 @@ struct kvm_run { __u64 gpa; __u64 size; } memory_fault; + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + __u64 ret; + __u64 data[5]; + } unknown; + struct { + __u64 ret; + __u64 gpa; + __u64 size; + } get_quote; + struct { + __u64 ret; + __u64 leaf; + __u64 r11, r12, r13, r14; + } get_tdvmcall_info; + struct { + __u64 ret; + __u64 vector; + } setup_event_notify; + }; + } tdx; /* Fix the size of the union. */ char padding[256]; }; @@ -935,6 +961,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2 240 #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 +#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 34127653fd00..33c9b578b3b2 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,4 +40,19 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. 
*/ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + +enum init_ns_ino { + IPC_NS_INIT_INO = 0xEFFFFFFFU, + UTS_NS_INIT_INO = 0xEFFFFFFEU, + USER_NS_INIT_INO = 0xEFFFFFFDU, + PID_NS_INIT_INO = 0xEFFFFFFCU, + CGROUP_NS_INIT_INO = 0xEFFFFFFBU, + TIME_NS_INIT_INO = 0xEFFFFFFAU, + NET_NS_INIT_INO = 0xEFFFFFF9U, + MNT_NS_INIT_INO = 0xEFFFFFF8U, +}; + #endif /* __LINUX_NSFS_H */ diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index 9ef569492560..ddaeb4eb3e24 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -75,6 +75,9 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) size_t ci, cj, ei; int cmp; + if (!excludes->cnt) + return; + ci = cj = ei = 0; while (ci < cmds->cnt && ei < excludes->cnt) { cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index ef032e17fec4..eb295756c3bf 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -830,7 +830,7 @@ class TypeArrayNest(Type): 'ynl_attr_for_each_nested(attr2, attr) {', '\tif (ynl_attr_validate(yarg, attr2))', '\t\treturn YNL_PARSE_CB_ERROR;', - f'\t{var}->_count.{self.c_name}++;', + f'\tn_{self.c_name}++;', '}'] return get_lines, None, local_vars diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index b6fdc68053cc..2e555c4060c5 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -278,6 +278,25 @@ static bool decode_insn_reg2i16_fomat(union loongarch_instruction inst, return true; } +static bool decode_insn_reg3_fomat(union loongarch_instruction inst, + struct instruction *insn) +{ + switch (inst.reg3_format.opcode) { + case amswapw_op: + if (inst.reg3_format.rd == LOONGARCH_GPR_ZERO && + inst.reg3_format.rk == LOONGARCH_GPR_RA && + inst.reg3_format.rj == LOONGARCH_GPR_ZERO) { + /* amswap.w $zero, $ra, $zero */ + insn->type = INSN_BUG; + } + break; + default: + return false; + } + + return true; +} + int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, struct instruction *insn) @@ -309,11 +328,19 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec return 0; if (decode_insn_reg2i16_fomat(inst, insn)) return 0; + if (decode_insn_reg3_fomat(inst, insn)) + return 0; - if (inst.word == 0) + if (inst.word == 0) { + /* andi $zero, $zero, 0x0 */ insn->type = INSN_NOP; - else if (inst.reg0i15_format.opcode == break_op) { - /* break */ + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x0) { + /* break 0x0 */ + insn->type = INSN_TRAP; + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x1) { + /* break 0x1 */ insn->type = INSN_BUG; } else if (inst.reg2_format.opcode == ertn_op) { /* ertn */ diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index e39f86d97002..a80b75f7b061 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -27,6 +27,7 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, struct table_info *next_table; unsigned long tmp_insn_offset; unsigned long tmp_rodata_offset; + bool is_valid_list = false; rsec = find_section_by_name(file->elf, ".rela.discard.tablejump_annotate"); if (!rsec) @@ -35,6 +36,12 @@ static void 
get_rodata_table_size_by_table_annotate(struct objtool_file *file, INIT_LIST_HEAD(&table_list); for_each_reloc(rsec, reloc) { + if (reloc->sym->sec->rodata) + continue; + + if (strcmp(insn->sec->name, reloc->sym->sec->name)) + continue; + orig_table = malloc(sizeof(struct table_info)); if (!orig_table) { WARN("malloc failed"); @@ -49,6 +56,22 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, if (reloc_idx(reloc) + 1 == sec_num_entries(rsec)) break; + + if (strcmp(insn->sec->name, (reloc + 1)->sym->sec->name)) { + list_for_each_entry(orig_table, &table_list, jump_info) { + if (orig_table->insn_offset == insn->offset) { + is_valid_list = true; + break; + } + } + + if (!is_valid_list) { + list_del_init(&table_list); + continue; + } + + break; + } } list_for_each_entry(orig_table, &table_list, jump_info) { diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 98c4713c1b09..0ad5cc70ecbe 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -880,3 +880,15 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_X86_64_32: + case R_X86_64_32S: + case R_X86_64_64: + return true; + default: + return false; + } +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 80239843e9f0..0f6b197cfcb0 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -87,6 +87,7 @@ static const struct option check_options[] = { OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"), OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), @@ -162,6 +163,7 @@ static bool opts_valid(void) opts.hack_noinstr || opts.ibt || opts.mcount || + opts.noabs || opts.noinstr || opts.orc || opts.retpoline || diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d14f20ef1db1..093fcd01dd6e 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3564,7 +3564,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6)) + !strncmp(func->name, "__pfx_", 6) || + !strncmp(func->name, "__pi___cfi_", 11) || + !strncmp(func->name, "__pi___pfx_", 11)) return 0; if (file->ignore_unreachables) @@ -4644,6 +4646,47 @@ static void disas_warned_funcs(struct objtool_file *file) disas_funcs(funcs); } +__weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + unsigned int type = reloc_type(reloc); + size_t sz = elf_addr_size(elf); + + return (sz == 8) ? 
(type == R_ABS64) : (type == R_ABS32); +} + +static int check_abs_references(struct objtool_file *file) +{ + struct section *sec; + struct reloc *reloc; + int ret = 0; + + for_each_sec(file, sec) { + /* absolute references in non-loadable sections are fine */ + if (!(sec->sh.sh_flags & SHF_ALLOC)) + continue; + + /* section must have an associated .rela section */ + if (!sec->rsec) + continue; + + /* + * Special case for compiler generated metadata that is not + * consumed until after boot. + */ + if (!strcmp(sec->name, "__patchable_function_entries")) + continue; + + for_each_reloc(sec->rsec, reloc) { + if (arch_absolute_reloc(file->elf, reloc)) { + WARN("section %s has absolute relocation at offset 0x%lx", + sec->name, reloc_offset(reloc)); + ret++; + } + } + } + return ret; +} + struct insn_chunk { void *addr; struct insn_chunk *next; @@ -4777,6 +4820,9 @@ int check(struct objtool_file *file) goto out; } + if (opts.noabs) + warnings += check_abs_references(file); + if (opts.orc && nr_insns) { ret = orc_create(file); if (ret) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 01ef6f415adf..be33c7b43180 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -97,6 +97,7 @@ bool arch_is_embedded_insn(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); bool arch_pc_relative_reloc(struct reloc *reloc); +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 6b08666fa69d..ab22673862e1 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -26,6 +26,7 @@ struct opts { bool uaccess; int prefix; bool cfi; + bool noabs; /* options: */ bool backtrace; diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 6a922d046b8e..802895fae3ca 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -45,7 +45,6 @@ NORETURN(rewind_stack_and_make_dead) NORETURN(rust_begin_unwind) NORETURN(rust_helper_BUG) NORETURN(sev_es_terminate) -NORETURN(snp_abort) NORETURN(start_kernel) NORETURN(stop_this_cpu) NORETURN(usercopy_abort) diff --git a/tools/perf/arch/arm/entry/syscalls/syscall.tbl b/tools/perf/arch/arm/entry/syscalls/syscall.tbl index 27c1d5ebcd91..b07e699aaa3c 100644 --- a/tools/perf/arch/arm/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/arm/entry/syscalls/syscall.tbl @@ -482,3 +482,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 1e8c44c7b614..7a7049c2c307 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -382,3 +382,5 @@ 465 n64 listxattrat sys_listxattrat 466 n64 removexattrat sys_removexattrat 467 n64 open_tree_attr sys_open_tree_attr +468 n64 file_getattr sys_file_getattr +469 n64 file_setattr sys_file_setattr diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 9a084bdb8926..b453e80dfc00 100644 --- 
a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -558,3 +558,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index a4569b96ef06..8a6744d658db 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -470,3 +470,5 @@ 465 common listxattrat sys_listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr sys_file_setattr diff --git a/tools/perf/arch/sh/entry/syscalls/syscall.tbl b/tools/perf/arch/sh/entry/syscalls/syscall.tbl index 52a7652fcff6..5e9c9eff5539 100644 --- a/tools/perf/arch/sh/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/sh/entry/syscalls/syscall.tbl @@ -471,3 +471,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/arch/sparc/entry/syscalls/syscall.tbl b/tools/perf/arch/sparc/entry/syscalls/syscall.tbl index 83e45eb6c095..ebb7d06d1044 100644 --- a/tools/perf/arch/sparc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/sparc/entry/syscalls/syscall.tbl @@ -513,3 +513,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl index ac007ea00979..4877e16da69a 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl @@ -473,3 +473,5 @@ 465 i386 listxattrat sys_listxattrat 466 i386 removexattrat sys_removexattrat 467 i386 open_tree_attr sys_open_tree_attr +468 i386 file_getattr sys_file_getattr +469 i386 file_setattr sys_file_setattr diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index cfb5ca41e30d..92cf0fe2291e 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -391,6 +391,8 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr # # Due to a historical design error, certain syscalls are numbered differently diff --git a/tools/perf/arch/x86/tests/topdown.c b/tools/perf/arch/x86/tests/topdown.c index 8d0ea7a4bbc1..1eba3b4594ef 100644 --- a/tools/perf/arch/x86/tests/topdown.c +++ b/tools/perf/arch/x86/tests/topdown.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "arch-tests.h" #include "../util/topdown.h" +#include "debug.h" #include "evlist.h" #include "parse-events.h" #include "pmu.h" diff --git a/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl b/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl index f657a77314f8..374e4cb788d8 100644 --- 
a/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl @@ -438,3 +438,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c index aad572a78d7f..12387ea88b9a 100644 --- a/tools/perf/bench/inject-buildid.c +++ b/tools/perf/bench/inject-buildid.c @@ -85,7 +85,7 @@ static int add_dso(const char *fpath, const struct stat *sb __maybe_unused, if (typeflag == FTW_D || typeflag == FTW_SL) return 0; - if (filename__read_build_id(fpath, &bid) < 0) + if (filename__read_build_id(fpath, &bid, /*block=*/true) < 0) return 0; dso->name = realpath(fpath, NULL); diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index c98104481c8a..2e0f2004696a 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -180,7 +180,7 @@ static int build_id_cache__add_file(const char *filename, struct nsinfo *nsi) struct nscookie nsc; nsinfo__mountns_enter(nsi, &nsc); - err = filename__read_build_id(filename, &bid); + err = filename__read_build_id(filename, &bid, /*block=*/true); nsinfo__mountns_exit(&nsc); if (err < 0) { pr_debug("Couldn't read a build-id in %s\n", filename); @@ -204,7 +204,7 @@ static int build_id_cache__remove_file(const char *filename, struct nsinfo *nsi) int err; nsinfo__mountns_enter(nsi, &nsc); - err = filename__read_build_id(filename, &bid); + err = filename__read_build_id(filename, &bid, /*block=*/true); nsinfo__mountns_exit(&nsc); if (err < 0) { pr_debug("Couldn't read a build-id in %s\n", filename); @@ -280,7 +280,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused) if (!dso__build_id_filename(dso, filename, sizeof(filename), false)) return true; - if (filename__read_build_id(filename, &bid) == -1) { + if (filename__read_build_id(filename, &bid, /*block=*/true) == -1) { if (errno == ENOENT) return false; @@ -309,7 +309,7 @@ static int build_id_cache__update_file(const char *filename, struct nsinfo *nsi) int err; nsinfo__mountns_enter(nsi, &nsc); - err = filename__read_build_id(filename, &bid); + err = filename__read_build_id(filename, &bid, /*block=*/true); nsinfo__mountns_exit(&nsc); if (err < 0) { pr_debug("Couldn't read a build-id in %s\n", filename); diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 40ba6a94f719..a114b3fa1bea 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -680,12 +680,12 @@ static int dso__read_build_id(struct dso *dso) mutex_lock(dso__lock(dso)); nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); - if (filename__read_build_id(dso__long_name(dso), &bid) > 0) + if (filename__read_build_id(dso__long_name(dso), &bid, /*block=*/true) > 0) dso__set_build_id(dso, &bid); else if (dso__nsinfo(dso)) { char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso)); - if (new_name && filename__read_build_id(new_name, &bid) > 0) + if (new_name && filename__read_build_id(new_name, &bid, /*block=*/true) > 0) dso__set_build_id(dso, &bid); free(new_name); } diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fd49703021fd..078634461df2 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2009,6 +2009,7 @@ static int __cmd_contention(int argc, const char **argv) .owner = show_lock_owner, .cgroups = 
RB_ROOT, }; + struct perf_env host_env; lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table)); if (!lockhash_table) @@ -2024,7 +2025,10 @@ static int __cmd_contention(int argc, const char **argv) eops.mmap = perf_event__process_mmap; eops.tracing_data = perf_event__process_tracing_data; - session = perf_session__new(use_bpf ? NULL : &data, &eops); + perf_env__init(&host_env); + session = __perf_session__new(use_bpf ? NULL : &data, &eops, + /*trace_event_repipe=*/false, &host_env); + if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); err = PTR_ERR(session); @@ -2142,6 +2146,7 @@ out_delete: evlist__delete(con.evlist); lock_contention_finish(&con); perf_session__delete(session); + perf_env__exit(&host_env); zfree(&lockhash_table); return err; } diff --git a/tools/perf/tests/pe-file-parsing.c b/tools/perf/tests/pe-file-parsing.c index 30c7da79e109..8b31d1d05f90 100644 --- a/tools/perf/tests/pe-file-parsing.c +++ b/tools/perf/tests/pe-file-parsing.c @@ -37,7 +37,7 @@ static int run_dir(const char *d) size_t idx; scnprintf(filename, PATH_MAX, "%s/pe-file.exe", d); - ret = filename__read_build_id(filename, &bid); + ret = filename__read_build_id(filename, &bid, /*block=*/true); TEST_ASSERT_VAL("Failed to read build_id", ret == sizeof(expect_build_id)); TEST_ASSERT_VAL("Wrong build_id", !memcmp(bid.data, expect_build_id, @@ -49,7 +49,7 @@ static int run_dir(const char *d) !strcmp(debuglink, expect_debuglink)); scnprintf(debugfile, PATH_MAX, "%s/%s", d, debuglink); - ret = filename__read_build_id(debugfile, &bid); + ret = filename__read_build_id(debugfile, &bid, /*block=*/true); TEST_ASSERT_VAL("Failed to read debug file build_id", ret == sizeof(expect_build_id)); TEST_ASSERT_VAL("Wrong build_id", !memcmp(bid.data, expect_build_id, diff --git a/tools/perf/tests/sdt.c b/tools/perf/tests/sdt.c index 93baee2eae42..6132f1af3e22 100644 --- a/tools/perf/tests/sdt.c +++ b/tools/perf/tests/sdt.c @@ -31,7 +31,7 @@ static int build_id_cache__add_file(const char *filename) struct build_id bid = { .size = 0, }; int err; - err = filename__read_build_id(filename, &bid); + err = filename__read_build_id(filename, &bid, /*block=*/true); if (err < 0) { pr_debug("Failed to read build id of %s\n", filename); return err; diff --git a/tools/perf/tests/shell/test_bpf_metadata.sh b/tools/perf/tests/shell/test_bpf_metadata.sh index 69e3c2055134..be67d56e0f09 100755 --- a/tools/perf/tests/shell/test_bpf_metadata.sh +++ b/tools/perf/tests/shell/test_bpf_metadata.sh @@ -61,7 +61,7 @@ test_bpf_metadata() { /perf_version/ { if (entry) print $NF; } - ' | egrep "$VERS" > /dev/null + ' | grep -qF "$VERS" then echo "Basic BPF metadata test [Failed invalid output]" err=1 diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h index a15ac2fa4b20..f291ab4f94eb 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h @@ -90,10 +90,28 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* Reserved kernel ranges [-100], [-10000, -40000]. */ #define AT_FDCWD -100 /* Special value for dirfd used to indicate openat should use the current working directory. 
*/ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gordian knot, for internal kernel usage, we use + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... + */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ + +#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h index 0098b0ce8ccb..0bd678a4a10e 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fs.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -60,6 +60,17 @@ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ #define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ +/* + * The root inode of procfs is guaranteed to always have the same inode number. + * For programs that make heavy use of procfs, verifying that the root is a + * real procfs root and using openat2(RESOLVE_{NO_{XDEV,MAGICLINKS},BENEATH}) + * will allow you to make sure you are never tricked into operating on the + * wrong procfs file. + */ +enum procfs_ino { + PROCFS_ROOT_INO = 1, +}; + struct file_clone_range { __s64 src_fd; __u64 src_offset; @@ -91,6 +102,63 @@ struct fs_sysfs_path { __u8 name[128]; }; +/* Protection info capability flags */ +#define LBMD_PI_CAP_INTEGRITY (1 << 0) +#define LBMD_PI_CAP_REFTAG (1 << 1) + +/* Checksum types for Protection Information */ +#define LBMD_PI_CSUM_NONE 0 +#define LBMD_PI_CSUM_IP 1 +#define LBMD_PI_CSUM_CRC16_T10DIF 2 +#define LBMD_PI_CSUM_CRC64_NVME 4 + +/* sizeof first published struct */ +#define LBMD_SIZE_VER0 16 + +/* + * Logical block metadata capability descriptor + * If the device does not support metadata, all the fields will be zero. + * Applications must check lbmd_flags to determine whether metadata is + * supported or not. 
+ */ +struct logical_block_metadata_cap { + /* Bitmask of logical block metadata capability flags */ + __u32 lbmd_flags; + /* + * The amount of data described by each unit of logical block + * metadata + */ + __u16 lbmd_interval; + /* + * Size in bytes of the logical block metadata associated with each + * interval + */ + __u8 lbmd_size; + /* + * Size in bytes of the opaque block tag associated with each + * interval + */ + __u8 lbmd_opaque_size; + /* + * Offset in bytes of the opaque block tag within the logical block + * metadata + */ + __u8 lbmd_opaque_offset; + /* Size in bytes of the T10 PI tuple associated with each interval */ + __u8 lbmd_pi_size; + /* Offset in bytes of T10 PI tuple within the logical block metadata */ + __u8 lbmd_pi_offset; + /* T10 PI guard tag type */ + __u8 lbmd_guard_tag_type; + /* Size in bytes of the T10 PI application tag */ + __u8 lbmd_app_tag_size; + /* Size in bytes of the T10 PI reference tag */ + __u8 lbmd_ref_tag_size; + /* Size in bytes of the T10 PI storage tag */ + __u8 lbmd_storage_tag_size; + __u8 pad; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -149,6 +217,24 @@ struct fsxattr { }; /* + * Variable size structure for file_[sg]et_attr(). + * + * Note. This is alternative to the structure 'struct file_kattr'/'struct fsxattr'. + * As this structure is passed to/from userspace with its size, this can + * be versioned based on the size. + */ +struct file_attr { + __u64 fa_xflags; /* xflags field value (get/set) */ + __u32 fa_extsize; /* extsize field value (get/set)*/ + __u32 fa_nextents; /* nextents field value (get) */ + __u32 fa_projid; /* project identifier (get/set) */ + __u32 fa_cowextsize; /* CoW extsize field value (get/set) */ +}; + +#define FILE_ATTR_SIZE_VER0 24 +#define FILE_ATTR_SIZE_LATEST FILE_ATTR_SIZE_VER0 + +/* * Flags for the fsx_xflags field */ #define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ @@ -247,6 +333,8 @@ struct fsxattr { * also /sys/kernel/debug/ for filesystems with debugfs exports */ #define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) +/* Get logical block metadata capability details */ +#define FS_IOC_GETLBMD_CAP _IOWR(0x15, 2, struct logical_block_metadata_cap) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h index 3b93fb906e3c..ed3aed264aeb 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h @@ -244,6 +244,8 @@ struct prctl_mm_map { # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) /* Unused; kept only for source compatibility */ # define PR_MTE_TCF_SHIFT 1 +/* MTE tag check store only */ +# define PR_MTE_STORE_ONLY (1UL << 19) /* RISC-V pointer masking tag length */ # define PR_PMLEN_SHIFT 24 # define PR_PMLEN_MASK (0x7fUL << PR_PMLEN_SHIFT) @@ -255,7 +257,12 @@ struct prctl_mm_map { /* Dispatch syscalls to a userspace handler */ #define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 -# define PR_SYS_DISPATCH_ON 1 +/* Enable dispatch except for the specified range */ +# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 +/* Enable dispatch for the specified range */ +# define PR_SYS_DISPATCH_INCLUSIVE_ON 2 +/* Legacy name for backwards compatibility */ +# define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON /* The control values for the user space selector when dispatch is 
enabled */ # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 diff --git a/tools/perf/trace/beauty/include/uapi/linux/vhost.h b/tools/perf/trace/beauty/include/uapi/linux/vhost.h index d4b3e2ae1314..c57674a6aa0d 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/vhost.h +++ b/tools/perf/trace/beauty/include/uapi/linux/vhost.h @@ -235,4 +235,39 @@ */ #define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) + +/* Extended features manipulation */ +#define VHOST_GET_FEATURES_ARRAY _IOR(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) +#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) + +/* fork_owner values for vhost */ +#define VHOST_FORK_OWNER_KTHREAD 0 +#define VHOST_FORK_OWNER_TASK 1 + +/** + * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device. + * This ioctl must be called before VHOST_SET_OWNER. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @param fork_owner: An 8-bit value that determines the vhost thread mode + * + * When fork_owner is set to VHOST_FORK_OWNER_TASK (default value): + * - Vhost will create vhost workers as tasks forked from the owner, + * inheriting all of the owner's attributes. + * + * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD: + * - Vhost will create vhost workers as kernel threads. + */ +#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x84, __u8) + +/** + * VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @return: An 8-bit value indicating the current thread mode. + */ +#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x85, __u8) + #endif diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 5b6d3e899e11..2298cd396c42 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -657,9 +657,15 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, info_node->info_linear = info_linear; info_node->metadata = NULL; if (!perf_env__insert_bpf_prog_info(env, info_node)) { - free(info_linear); + /* + * Insert failed, likely because of a duplicate event + * made by the sideband thread. Ignore synthesizing the + * metadata. + */ free(info_node); + goto out; } + /* info_linear is now owned by info_node and shouldn't be freed below. */ info_linear = NULL; /* @@ -827,18 +833,18 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, return err; } -static void perf_env__add_bpf_info(struct perf_env *env, u32 id) +static int perf_env__add_bpf_info(struct perf_env *env, u32 id) { struct bpf_prog_info_node *info_node; struct perf_bpil *info_linear; struct btf *btf = NULL; u64 arrays; u32 btf_id; - int fd; + int fd, err = 0; fd = bpf_prog_get_fd_by_id(id); if (fd < 0) - return; + return -EINVAL; arrays = 1UL << PERF_BPIL_JITED_KSYMS; arrays |= 1UL << PERF_BPIL_JITED_FUNC_LENS; @@ -852,6 +858,7 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id) info_linear = get_bpf_prog_info_linear(fd, arrays); if (IS_ERR_OR_NULL(info_linear)) { pr_debug("%s: failed to get BPF program info. 
aborting\n", __func__); + err = PTR_ERR(info_linear); goto out; } @@ -862,38 +869,46 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id) info_node->info_linear = info_linear; info_node->metadata = bpf_metadata_create(&info_linear->info); if (!perf_env__insert_bpf_prog_info(env, info_node)) { + pr_debug("%s: duplicate add bpf info request for id %u\n", + __func__, btf_id); free(info_linear); free(info_node); + goto out; } - } else + } else { free(info_linear); + err = -ENOMEM; + goto out; + } if (btf_id == 0) goto out; btf = btf__load_from_kernel_by_id(btf_id); - if (libbpf_get_error(btf)) { - pr_debug("%s: failed to get BTF of id %u, aborting\n", - __func__, btf_id); - goto out; + if (!btf) { + err = -errno; + pr_debug("%s: failed to get BTF of id %u %d\n", __func__, btf_id, err); + } else { + perf_env__fetch_btf(env, btf_id, btf); } - perf_env__fetch_btf(env, btf_id, btf); out: btf__free(btf); close(fd); + return err; } static int bpf_event__sb_cb(union perf_event *event, void *data) { struct perf_env *env = data; + int ret = 0; if (event->header.type != PERF_RECORD_BPF_EVENT) return -1; switch (event->bpf.type) { case PERF_BPF_EVENT_PROG_LOAD: - perf_env__add_bpf_info(env, event->bpf.id); + ret = perf_env__add_bpf_info(env, event->bpf.id); case PERF_BPF_EVENT_PROG_UNLOAD: /* @@ -907,7 +922,7 @@ static int bpf_event__sb_cb(union perf_event *event, void *data) break; } - return 0; + return ret; } int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env) diff --git a/tools/perf/util/bpf-utils.c b/tools/perf/util/bpf-utils.c index 80b1d2b3729b..5a66dc8594aa 100644 --- a/tools/perf/util/bpf-utils.c +++ b/tools/perf/util/bpf-utils.c @@ -20,7 +20,7 @@ struct bpil_array_desc { */ }; -static struct bpil_array_desc bpil_array_desc[] = { +static const struct bpil_array_desc bpil_array_desc[] = { [PERF_BPIL_JITED_INSNS] = { offsetof(struct bpf_prog_info, jited_prog_insns), offsetof(struct bpf_prog_info, jited_prog_len), @@ -115,7 +115,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) __u32 info_len = sizeof(info); __u32 data_len = 0; int i, err; - void *ptr; + __u8 *ptr; if (arrays >> PERF_BPIL_LAST_ARRAY) return ERR_PTR(-EINVAL); @@ -126,15 +126,15 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) pr_debug("can't get prog info: %s", strerror(errno)); return ERR_PTR(-EFAULT); } + if (info.type >= __MAX_BPF_PROG_TYPE) + pr_debug("%s:%d: unexpected program type %u\n", __func__, __LINE__, info.type); /* step 2: calculate total size of all arrays */ for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { + const struct bpil_array_desc *desc = &bpil_array_desc[i]; bool include_array = (arrays & (1UL << i)) > 0; - struct bpil_array_desc *desc; __u32 count, size; - desc = bpil_array_desc + i; - /* kernel is too old to support this field */ if (info_len < desc->array_offset + sizeof(__u32) || info_len < desc->count_offset + sizeof(__u32) || @@ -163,19 +163,20 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) ptr = info_linear->data; for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { - struct bpil_array_desc *desc; + const struct bpil_array_desc *desc = &bpil_array_desc[i]; __u32 count, size; if ((arrays & (1UL << i)) == 0) continue; - desc = bpil_array_desc + i; count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); bpf_prog_info_set_offset_u32(&info_linear->info, desc->count_offset, count); bpf_prog_info_set_offset_u32(&info_linear->info, desc->size_offset, size); + assert(ptr 
>= info_linear->data); + assert(ptr < &info_linear->data[data_len]); bpf_prog_info_set_offset_u64(&info_linear->info, desc->array_offset, ptr_to_u64(ptr)); @@ -189,27 +190,45 @@ get_bpf_prog_info_linear(int fd, __u64 arrays) free(info_linear); return ERR_PTR(-EFAULT); } + if (info_linear->info.type >= __MAX_BPF_PROG_TYPE) { + pr_debug("%s:%d: unexpected program type %u\n", + __func__, __LINE__, info_linear->info.type); + } /* step 6: verify the data */ + ptr = info_linear->data; for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { - struct bpil_array_desc *desc; - __u32 v1, v2; + const struct bpil_array_desc *desc = &bpil_array_desc[i]; + __u32 count1, count2, size1, size2; + __u64 ptr2; if ((arrays & (1UL << i)) == 0) continue; - desc = bpil_array_desc + i; - v1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, + count1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset); + count2 = bpf_prog_info_read_offset_u32(&info_linear->info, desc->count_offset); - if (v1 != v2) - pr_warning("%s: mismatch in element count\n", __func__); + if (count1 != count2) { + pr_warning("%s: mismatch in element count %u vs %u\n", __func__, count1, count2); + free(info_linear); + return ERR_PTR(-ERANGE); + } - v1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, + size1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset); + size2 = bpf_prog_info_read_offset_u32(&info_linear->info, desc->size_offset); - if (v1 != v2) - pr_warning("%s: mismatch in rec size\n", __func__); + if (size1 != size2) { + pr_warning("%s: mismatch in rec size %u vs %u\n", __func__, size1, size2); + free(info_linear); + return ERR_PTR(-ERANGE); + } + ptr2 = bpf_prog_info_read_offset_u64(&info_linear->info, desc->array_offset); + if (ptr_to_u64(ptr) != ptr2) { + pr_warning("%s: mismatch in array %p vs %llx\n", __func__, ptr, ptr2); + free(info_linear); + return ERR_PTR(-ERANGE); + } + ptr += roundup(count1 * size1, sizeof(__u64)); } /* step 7: update info_len and data_len */ @@ -224,13 +243,12 @@ void bpil_addr_to_offs(struct perf_bpil *info_linear) int i; for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { - struct bpil_array_desc *desc; + const struct bpil_array_desc *desc = &bpil_array_desc[i]; __u64 addr, offs; if ((info_linear->arrays & (1UL << i)) == 0) continue; - desc = bpil_array_desc + i; addr = bpf_prog_info_read_offset_u64(&info_linear->info, desc->array_offset); offs = addr - ptr_to_u64(info_linear->data); @@ -244,13 +262,12 @@ void bpil_offs_to_addr(struct perf_bpil *info_linear) int i; for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) { - struct bpil_array_desc *desc; + const struct bpil_array_desc *desc = &bpil_array_desc[i]; __u64 addr, offs; if ((info_linear->arrays & (1UL << i)) == 0) continue; - desc = bpil_array_desc + i; offs = bpf_prog_info_read_offset_u64(&info_linear->info, desc->array_offset); addr = offs + ptr_to_u64(info_linear->data); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index a7018a3b0437..bf7f3268b9a2 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -115,7 +115,7 @@ int filename__snprintf_build_id(const char *pathname, char *sbuild_id, size_t sb struct build_id bid = { .size = 0, }; int ret; - ret = filename__read_build_id(pathname, &bid); + ret = filename__read_build_id(pathname, &bid, /*block=*/true); if (ret < 0) return ret; @@ -841,7 +841,7 @@ static int 
filename__read_build_id_ns(const char *filename, int ret; nsinfo__mountns_enter(nsi, &nsc); - ret = filename__read_build_id(filename, bid); + ret = filename__read_build_id(filename, bid, /*block=*/true); nsinfo__mountns_exit(&nsc); return ret; diff --git a/tools/perf/util/debuginfo.c b/tools/perf/util/debuginfo.c index a44c70f93156..bb9ebd84ec2d 100644 --- a/tools/perf/util/debuginfo.c +++ b/tools/perf/util/debuginfo.c @@ -110,8 +110,12 @@ struct debuginfo *debuginfo__new(const char *path) if (!dso) goto out; - /* Set the build id for DSO_BINARY_TYPE__BUILDID_DEBUGINFO */ - if (is_regular_file(path) && filename__read_build_id(path, &bid) > 0) + /* + * Set the build id for DSO_BINARY_TYPE__BUILDID_DEBUGINFO. Don't block + * in case the path isn't for a regular file. + */ + assert(!dso__has_build_id(dso)); + if (filename__read_build_id(path, &bid, /*block=*/false) > 0) dso__set_build_id(dso, &bid); for (type = distro_dwarf_types; diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index 0a7645c7fae7..64c1d65b0149 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -81,13 +81,13 @@ static int dsos__read_build_ids_cb(struct dso *dso, void *data) return 0; } nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); - if (filename__read_build_id(dso__long_name(dso), &bid) > 0) { + if (filename__read_build_id(dso__long_name(dso), &bid, /*block=*/true) > 0) { dso__set_build_id(dso, &bid); args->have_build_id = true; } else if (errno == ENOENT && dso__nsinfo(dso)) { char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso)); - if (new_name && filename__read_build_id(new_name, &bid) > 0) { + if (new_name && filename__read_build_id(new_name, &bid, /*block=*/true) > 0) { dso__set_build_id(dso, &bid); args->have_build_id = true; } diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h index 89979ca23c3f..34e2fdfe7300 100644 --- a/tools/perf/util/include/linux/linkage.h +++ b/tools/perf/util/include/linux/linkage.h @@ -120,7 +120,7 @@ #endif // In the kernel sources (include/linux/cfi_types.h), this has a different // definition when CONFIG_CFI is used, for tools/ just use the !cfi // definition: #ifndef SYM_TYPED_START #define SYM_TYPED_START(name, linkage, align...) \ diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 8fabddc1c0da..72c7a4e15d61 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -32,7 +32,7 @@ static void intel_pt_insn_decoder(struct insn *insn, intel_pt_insn->rel = 0; intel_pt_insn->emulated_ptwrite = false; - if (insn_is_avx(insn)) { + if (insn_is_avx_or_xop(insn)) { intel_pt_insn->op = INTEL_PT_OP_OTHER; intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH; intel_pt_insn->length = insn->length; diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 85b2a93a59ac..779f6230130a 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -477,6 +477,7 @@ static int __maps__insert(struct maps *maps, struct map *new) } /* Insert the value at the end. 
*/ maps_by_address[nr_maps] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) maps_by_name[nr_maps] = map__get(new); @@ -502,8 +503,6 @@ static int __maps__insert(struct maps *maps, struct map *new) if (map__end(new) < map__start(new)) RC_CHK_ACCESS(maps)->ends_broken = true; - map__set_kmap_maps(new, maps); - return 0; } @@ -891,6 +890,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) if (before) { map__put(maps_by_address[i]); maps_by_address[i] = before; + map__set_kmap_maps(before, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -918,6 +918,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -942,14 +943,13 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); maps_by_name[ni] = map__get(new); } - map__set_kmap_maps(new, maps); - check_invariants(maps); return err; } @@ -1019,6 +1019,7 @@ int maps__copy_from(struct maps *dest, struct maps *parent) err = unwind__prepare_access(dest, new, NULL); if (!err) { dest_maps_by_address[i] = new; + map__set_kmap_maps(new, dest); if (dest_maps_by_name) dest_maps_by_name[i] = map__get(new); RC_CHK_ACCESS(dest)->nr_maps = i + 1; diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 6d2c280a1730..1346fd180653 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -873,13 +873,17 @@ out: #ifdef HAVE_LIBBFD_BUILDID_SUPPORT -static int read_build_id(const char *filename, struct build_id *bid) +static int read_build_id(const char *filename, struct build_id *bid, bool block) { size_t size = sizeof(bid->data); - int err = -1; + int err = -1, fd; bfd *abfd; - abfd = bfd_openr(filename, NULL); + fd = open(filename, block ? O_RDONLY : (O_RDONLY | O_NONBLOCK)); + if (fd < 0) + return -1; + + abfd = bfd_fdopenr(filename, /*target=*/NULL, fd); if (!abfd) return -1; @@ -902,7 +906,7 @@ out_close: #else // HAVE_LIBBFD_BUILDID_SUPPORT -static int read_build_id(const char *filename, struct build_id *bid) +static int read_build_id(const char *filename, struct build_id *bid, bool block) { size_t size = sizeof(bid->data); int fd, err = -1; @@ -911,7 +915,7 @@ static int read_build_id(const char *filename, struct build_id *bid) if (size < BUILD_ID_SIZE) goto out; - fd = open(filename, O_RDONLY); + fd = open(filename, block ? 
O_RDONLY : (O_RDONLY | O_NONBLOCK)); if (fd < 0) goto out; @@ -934,7 +938,7 @@ out: #endif // HAVE_LIBBFD_BUILDID_SUPPORT -int filename__read_build_id(const char *filename, struct build_id *bid) +int filename__read_build_id(const char *filename, struct build_id *bid, bool block) { struct kmod_path m = { .name = NULL, }; char path[PATH_MAX]; @@ -958,9 +962,10 @@ int filename__read_build_id(const char *filename, struct build_id *bid) } close(fd); filename = path; + block = true; } - err = read_build_id(filename, bid); + err = read_build_id(filename, bid, block); if (m.comp) unlink(filename); diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 7201494c5c20..41e4ebe5eac5 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -4,7 +4,6 @@ #include <errno.h> #include <unistd.h> -#include <stdio.h> #include <fcntl.h> #include <string.h> #include <stdlib.h> @@ -86,13 +85,10 @@ int filename__read_debuglink(const char *filename __maybe_unused, /* * Just try PT_NOTE header otherwise fails */ -int filename__read_build_id(const char *filename, struct build_id *bid) +int filename__read_build_id(const char *filename, struct build_id *bid, bool block) { - FILE *fp; - int ret = -1; + int fd, ret = -1; bool need_swap = false, elf32; - u8 e_ident[EI_NIDENT]; - int i; union { struct { Elf32_Ehdr ehdr32; @@ -103,28 +99,27 @@ int filename__read_build_id(const char *filename, struct build_id *bid) Elf64_Phdr *phdr64; }; } hdrs; - void *phdr; - size_t phdr_size; - void *buf = NULL; - size_t buf_size = 0; + void *phdr, *buf = NULL; + ssize_t phdr_size, ehdr_size, buf_size = 0; - fp = fopen(filename, "r"); - if (fp == NULL) + fd = open(filename, block ? O_RDONLY : (O_RDONLY | O_NONBLOCK)); + if (fd < 0) return -1; - if (fread(e_ident, sizeof(e_ident), 1, fp) != 1) + if (read(fd, hdrs.ehdr32.e_ident, EI_NIDENT) != EI_NIDENT) goto out; - if (memcmp(e_ident, ELFMAG, SELFMAG) || - e_ident[EI_VERSION] != EV_CURRENT) + if (memcmp(hdrs.ehdr32.e_ident, ELFMAG, SELFMAG) || + hdrs.ehdr32.e_ident[EI_VERSION] != EV_CURRENT) goto out; - need_swap = check_need_swap(e_ident[EI_DATA]); - elf32 = e_ident[EI_CLASS] == ELFCLASS32; + need_swap = check_need_swap(hdrs.ehdr32.e_ident[EI_DATA]); + elf32 = hdrs.ehdr32.e_ident[EI_CLASS] == ELFCLASS32; + ehdr_size = (elf32 ? sizeof(hdrs.ehdr32) : sizeof(hdrs.ehdr64)) - EI_NIDENT; - if (fread(elf32 ? (void *)&hdrs.ehdr32 : (void *)&hdrs.ehdr64, - elf32 ? sizeof(hdrs.ehdr32) : sizeof(hdrs.ehdr64), - 1, fp) != 1) + if (read(fd, + (elf32 ? (void *)&hdrs.ehdr32 : (void *)&hdrs.ehdr64) + EI_NIDENT, + ehdr_size) != ehdr_size) goto out; if (need_swap) { @@ -138,14 +133,18 @@ int filename__read_build_id(const char *filename, struct build_id *bid) hdrs.ehdr64.e_phnum = bswap_16(hdrs.ehdr64.e_phnum); } } - phdr_size = elf32 ? hdrs.ehdr32.e_phentsize * hdrs.ehdr32.e_phnum - : hdrs.ehdr64.e_phentsize * hdrs.ehdr64.e_phnum; + if ((elf32 && hdrs.ehdr32.e_phentsize != sizeof(Elf32_Phdr)) || + (!elf32 && hdrs.ehdr64.e_phentsize != sizeof(Elf64_Phdr))) + goto out; + + phdr_size = elf32 ? sizeof(Elf32_Phdr) * hdrs.ehdr32.e_phnum + : sizeof(Elf64_Phdr) * hdrs.ehdr64.e_phnum; phdr = malloc(phdr_size); if (phdr == NULL) goto out; - fseek(fp, elf32 ? hdrs.ehdr32.e_phoff : hdrs.ehdr64.e_phoff, SEEK_SET); - if (fread(phdr, phdr_size, 1, fp) != 1) + lseek(fd, elf32 ? 
hdrs.ehdr32.e_phoff : hdrs.ehdr64.e_phoff, SEEK_SET); + if (read(fd, phdr, phdr_size) != phdr_size) goto out_free; if (elf32) @@ -153,8 +152,8 @@ int filename__read_build_id(const char *filename, struct build_id *bid) else hdrs.phdr64 = phdr; - for (i = 0; i < elf32 ? hdrs.ehdr32.e_phnum : hdrs.ehdr64.e_phnum; i++) { - size_t p_filesz; + for (int i = 0; i < (elf32 ? hdrs.ehdr32.e_phnum : hdrs.ehdr64.e_phnum); i++) { + ssize_t p_filesz; if (need_swap) { if (elf32) { @@ -180,8 +179,8 @@ int filename__read_build_id(const char *filename, struct build_id *bid) goto out_free; buf = tmp; } - fseek(fp, elf32 ? hdrs.phdr32[i].p_offset : hdrs.phdr64[i].p_offset, SEEK_SET); - if (fread(buf, p_filesz, 1, fp) != 1) + lseek(fd, elf32 ? hdrs.phdr32[i].p_offset : hdrs.phdr64[i].p_offset, SEEK_SET); + if (read(fd, buf, p_filesz) != p_filesz) goto out_free; ret = read_build_id(buf, p_filesz, bid, need_swap); @@ -194,7 +193,7 @@ out_free: free(buf); free(phdr); out: - fclose(fp); + close(fd); return ret; } @@ -324,7 +323,7 @@ int dso__load_sym(struct dso *dso, struct map *map __maybe_unused, if (ret >= 0) RC_CHK_ACCESS(dso)->is_64_bit = ret; - if (filename__read_build_id(ss->name, &bid) > 0) + if (filename__read_build_id(ss->name, &bid, /*block=*/true) > 0) dso__set_build_id(dso, &bid); return 0; } diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index e816e4220d33..3fed54de5401 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1869,14 +1869,14 @@ int dso__load(struct dso *dso, struct map *map) /* * Read the build id if possible. This is required for - * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work + * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work. Don't block in case path + * isn't for a regular file. */ - if (!dso__has_build_id(dso) && - is_regular_file(dso__long_name(dso))) { + if (!dso__has_build_id(dso)) { struct build_id bid = { .size = 0, }; __symbol__join_symfs(name, PATH_MAX, dso__long_name(dso)); - if (filename__read_build_id(name, &bid) > 0) + if (filename__read_build_id(name, &bid, /*block=*/false) > 0) dso__set_build_id(dso, &bid); } diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 3fb5d146d9b1..347106218799 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -140,7 +140,7 @@ struct symbol *dso__next_symbol(struct symbol *sym); enum dso_type dso__type_fd(int fd); -int filename__read_build_id(const char *filename, struct build_id *id); +int filename__read_build_id(const char *filename, struct build_id *id, bool block); int sysfs__read_build_id(const char *filename, struct build_id *bid); int modules__parse(const char *filename, void *arg, int (*process_module)(void *arg, const char *name, diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index cb2c1ace304a..fcd1fd13c30e 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -401,7 +401,7 @@ static void perf_record_mmap2__read_build_id(struct perf_record_mmap2 *event, nsi = nsinfo__new(event->pid); nsinfo__mountns_enter(nsi, &nc); - rc = filename__read_build_id(event->filename, &bid) > 0 ? 0 : -1; + rc = filename__read_build_id(event->filename, &bid, /*block=*/false) > 0 ? 
0 : -1; nsinfo__mountns_exit(&nc); nsinfo__put(nsi); diff --git a/tools/power/cpupower/man/cpupower-set.1 b/tools/power/cpupower/man/cpupower-set.1 index 500653ef98c7..8ac82b6f9189 100644 --- a/tools/power/cpupower/man/cpupower-set.1 +++ b/tools/power/cpupower/man/cpupower-set.1 @@ -81,10 +81,11 @@ Refer to the AMD P-State kernel documentation for further information. .RE .PP -\-\-turbo\-boost, \-t +\-\-turbo\-boost, \-\-boost, \-t .RS 4 -This option is used to enable or disable the turbo boost feature on -supported Intel and AMD processors. +This option is used to enable or disable the boost feature on +supported Intel and AMD processors, and other boost supported systems. +(The --boost option is an alias for the --turbo-boost option) This option takes as parameter either \fB1\fP to enable, or \fB0\fP to disable the feature. diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index fc750e127404..7d3732f5f2f6 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -128,7 +128,7 @@ static int get_boost_mode_x86(unsigned int cpu) /* ToDo: Make this more global */ unsigned long pstates[MAX_HW_PSTATES] = {0,}; - ret = cpufreq_has_boost_support(cpu, &support, &active, &b_states); + ret = cpufreq_has_x86_boost_support(cpu, &support, &active, &b_states); if (ret) { printf(_("Error while evaluating Boost Capabilities" " on CPU %d -- are you root?\n"), cpu); @@ -204,6 +204,18 @@ static int get_boost_mode_x86(unsigned int cpu) return 0; } +static int get_boost_mode_generic(unsigned int cpu) +{ + bool active; + + if (!cpufreq_has_generic_boost_support(&active)) { + printf(_(" boost state support:\n")); + printf(_(" Active: %s\n"), active ? _("yes") : _("no")); + } + + return 0; +} + /* --boost / -b */ static int get_boost_mode(unsigned int cpu) @@ -214,6 +226,8 @@ static int get_boost_mode(unsigned int cpu) cpupower_cpu_info.vendor == X86_VENDOR_HYGON || cpupower_cpu_info.vendor == X86_VENDOR_INTEL) return get_boost_mode_x86(cpu); + else + get_boost_mode_generic(cpu); freqs = cpufreq_get_boost_frequencies(cpu); if (freqs) { diff --git a/tools/power/cpupower/utils/cpupower-set.c b/tools/power/cpupower/utils/cpupower-set.c index 0677b58374ab..c2117e5650dd 100644 --- a/tools/power/cpupower/utils/cpupower-set.c +++ b/tools/power/cpupower/utils/cpupower-set.c @@ -21,6 +21,7 @@ static struct option set_opts[] = { {"epp", required_argument, NULL, 'e'}, {"amd-pstate-mode", required_argument, NULL, 'm'}, {"turbo-boost", required_argument, NULL, 't'}, + {"boost", required_argument, NULL, 't'}, { }, }; @@ -62,8 +63,8 @@ int cmd_set(int argc, char **argv) params.params = 0; /* parameter parsing */ - while ((ret = getopt_long(argc, argv, "b:e:m:", - set_opts, NULL)) != -1) { + while ((ret = getopt_long(argc, argv, "b:e:m:t:", + set_opts, NULL)) != -1) { switch (ret) { case 'b': if (params.perf_bias) diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 95749b8ee475..82ea62bdf5a2 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -103,6 +103,9 @@ extern struct cpupower_cpu_info cpupower_cpu_info; /* cpuid and cpuinfo helpers **************************/ +int cpufreq_has_generic_boost_support(bool *active); +int cpupower_set_turbo_boost(int turbo_boost); + /* X86 ONLY ****************************************/ #if defined(__i386__) || defined(__x86_64__) @@ -118,7 +121,6 @@ extern unsigned long long 
msr_intel_get_turbo_ratio(unsigned int cpu); extern int cpupower_set_epp(unsigned int cpu, char *epp); extern int cpupower_set_amd_pstate_mode(char *mode); -extern int cpupower_set_turbo_boost(int turbo_boost); /* Read/Write msr ****************************/ @@ -139,8 +141,8 @@ extern int decode_pstates(unsigned int cpu, int boost_states, /* AMD HW pstate decoding **************************/ -extern int cpufreq_has_boost_support(unsigned int cpu, int *support, - int *active, int * states); +int cpufreq_has_x86_boost_support(unsigned int cpu, int *support, + int *active, int *states); /* AMD P-State stuff **************************/ bool cpupower_amd_pstate_enabled(void); @@ -181,13 +183,11 @@ static inline int cpupower_set_epp(unsigned int cpu, char *epp) { return -1; }; static inline int cpupower_set_amd_pstate_mode(char *mode) { return -1; }; -static inline int cpupower_set_turbo_boost(int turbo_boost) -{ return -1; }; /* Read/Write msr ****************************/ -static inline int cpufreq_has_boost_support(unsigned int cpu, int *support, - int *active, int * states) +static inline int cpufreq_has_x86_boost_support(unsigned int cpu, int *support, + int *active, int *states) { return -1; } static inline bool cpupower_amd_pstate_enabled(void) diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index 76e461ff4f74..166dc1e470ea 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -8,15 +8,14 @@ #include "helpers/helpers.h" #include "helpers/sysfs.h" #include "cpufreq.h" +#include "cpupower_intern.h" #if defined(__i386__) || defined(__x86_64__) -#include "cpupower_intern.h" - #define MSR_AMD_HWCR 0xc0010015 -int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, - int *states) +int cpufreq_has_x86_boost_support(unsigned int cpu, int *support, int *active, + int *states) { int ret; unsigned long long val; @@ -124,24 +123,6 @@ int cpupower_set_amd_pstate_mode(char *mode) return 0; } -int cpupower_set_turbo_boost(int turbo_boost) -{ - char path[SYSFS_PATH_MAX]; - char linebuf[2] = {}; - - snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost"); - - if (!is_valid_path(path)) - return -1; - - snprintf(linebuf, sizeof(linebuf), "%d", turbo_boost); - - if (cpupower_write_sysfs(path, linebuf, 2) <= 0) - return -1; - - return 0; -} - bool cpupower_amd_pstate_enabled(void) { char *driver = cpufreq_get_driver(0); @@ -160,6 +141,39 @@ bool cpupower_amd_pstate_enabled(void) #endif /* #if defined(__i386__) || defined(__x86_64__) */ +int cpufreq_has_generic_boost_support(bool *active) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[2] = {}; + unsigned long val; + char *endp; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost"); + + if (!is_valid_path(path)) + return -EACCES; + + if (cpupower_read_sysfs(path, linebuf, 2) <= 0) + return -EINVAL; + + val = strtoul(linebuf, &endp, 0); + if (endp == linebuf || errno == ERANGE) + return -EINVAL; + + switch (val) { + case 0: + *active = false; + break; + case 1: + *active = true; + break; + default: + return -EINVAL; + } + + return 0; +} + /* get_cpustate * * Gather the information of all online CPUs into bitmask struct @@ -259,3 +273,21 @@ void print_speed(unsigned long speed, int no_rounding) } } } + +int cpupower_set_turbo_boost(int turbo_boost) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[2] = {}; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost"); + + if (!is_valid_path(path)) + return -1; + + snprintf(linebuf, 
sizeof(linebuf), "%d", turbo_boost); + + if (cpupower_write_sysfs(path, linebuf, 2) <= 0) + return -1; + + return 0; +} diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 72a280e7a9d5..47eb2d4d13a5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1195,7 +1195,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_EMERALDRAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_D, &spr_features }, - { INTEL_PANTHERCOVE_X, &dmr_features }, + { INTEL_DIAMONDRAPIDS_X, &dmr_features }, { INTEL_LAKEFIELD, &cnl_features }, { INTEL_ALDERLAKE, &adl_features }, { INTEL_ALDERLAKE_L, &adl_features }, diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h new file mode 100644 index 000000000000..4366fb3c91ce --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define __arena_global __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ +#else + +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + * + * This is a workaround for LLVM compiler versions without + * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena + * pointers and native kernel/userspace ones. In this case we explicitly do so + * with cast_kern() and cast_user(). E.g., in the Linux kernel tree, + * tools/testing/selftests/bpf includes tests that use these macros to implement + * linked lists and hashtables backed by arena memory. In sched_ext, we use + * cast_kern() and cast_user() for compatibility with older LLVM toolchains. 
+ */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ + .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + +#define __arena +#define __arena_global SEC(".addr_space.1") +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. + */ +#ifdef TEST +#define can_loop true +#define __cond_break(expr) expr +#else +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ +#endif /* __BPF_FEATURE_MAY_GOTO */ +#endif /* TEST */ + +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) + + +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; diff --git a/tools/sched_ext/include/scx/bpf_arena_common.h b/tools/sched_ext/include/scx/bpf_arena_common.h new file mode 100644 index 000000000000..10141db0b59d --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.h @@ -0,0 +1,33 @@ +/* 
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +/* Provide the definition of PAGE_SIZE. */ +#include <sys/user.h> + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +char __attribute__((weak)) arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index d4e21558e982..06e2551033cb 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -24,14 +24,26 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <asm-generic/errno.h> -#include "user_exit_info.h" +#include "user_exit_info.bpf.h" #include "enum_defs.autogen.h" +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ +#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_EXITING 0x00000004 #define CLOCK_MONOTONIC 1 +#ifndef NR_CPUS +#define NR_CPUS 1024 +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + extern int LINUX_KERNEL_VERSION __kconfig; extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak; extern const char CONFIG_LOCALVERSION[64] __kconfig __weak; @@ -91,6 +103,8 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; +struct rq *scx_bpf_locked_rq(void) __ksym; +struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; @@ -107,6 +121,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __ static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} +#define SCX_STRINGIFY(x) #x +#define SCX_TOSTRING(x) SCX_STRINGIFY(x) + /* * Helper macro for initializing the fmt and variadic argument inputs to both * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to @@ -141,13 +158,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments * instead of an array of u64. Invoking this macro will cause the scheduler to * exit in an erroneous state, with diagnostic information being passed to the - * user. + * user. It appends the file and line number to aid debugging. */ #define scx_bpf_error(fmt, args...) 
\ ({ \ - scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_bstr_preamble( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ - ___scx_bpf_bstr_format_checker(fmt, ##args); \ + ___scx_bpf_bstr_format_checker( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \ }) /* @@ -229,6 +248,7 @@ BPF_PROG(name, ##args) * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of * `MEMBER_VPTR(ptr, ->member)`. */ +#ifndef MEMBER_VPTR #define MEMBER_VPTR(base, member) (typeof((base) member) *) \ ({ \ u64 __base = (u64)&(base); \ @@ -245,6 +265,7 @@ BPF_PROG(name, ##args) [max]"i"(sizeof(base) - sizeof((base) member))); \ __addr; \ }) +#endif /* MEMBER_VPTR */ /** * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element @@ -260,6 +281,7 @@ BPF_PROG(name, ##args) * size of the array to compute the max, which will result in rejection by * the verifier. */ +#ifndef ARRAY_ELEM_PTR #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ ({ \ u64 __base = (u64)arr; \ @@ -274,7 +296,7 @@ BPF_PROG(name, ##args) [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ __addr; \ }) - +#endif /* ARRAY_ELEM_PTR */ /* * BPF declarations and helpers @@ -438,8 +460,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) */ static inline bool is_migration_disabled(const struct task_struct *p) { - if (bpf_core_field_exists(p->migration_disabled)) - return p->migration_disabled; + /* + * Testing p->migration_disabled in a BPF code is tricky because the + * migration is _always_ disabled while running the BPF code. + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF + * code execution disable and re-enable the migration of the current + * task, respectively. So, the _current_ task of the sched_ext ops is + * always migration-disabled. Moreover, p->migration_disabled could be + * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is + * executed in the middle of the other BPF code execution. + * + * Therefore, we should decide that the _current_ task is + * migration-disabled only when its migration_disabled count is greater + * than one. In other words, when p->migration_disabled == 1, there is + * an ambiguity, so we should check if @p is the current task or not. + */ + if (bpf_core_field_exists(p->migration_disabled)) { + if (p->migration_disabled == 1) + return bpf_get_current_task_btf() != p; + else + return p->migration_disabled; + } return false; } @@ -476,7 +517,7 @@ static inline s64 time_delta(u64 after, u64 before) */ static inline bool time_after(u64 a, u64 b) { - return (s64)(b - a) < 0; + return (s64)(b - a) < 0; } /** @@ -500,7 +541,7 @@ static inline bool time_before(u64 a, u64 b) */ static inline bool time_after_eq(u64 a, u64 b) { - return (s64)(a - b) >= 0; + return (s64)(a - b) >= 0; } /** @@ -547,9 +588,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c) */ /* useful compiler attributes */ +#ifndef likely #define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#ifndef __maybe_unused #define __maybe_unused __attribute__((__unused__)) +#endif /* * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They @@ -633,6 +680,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s }) /* + * __calc_avg - Calculate exponential weighted moving average (EWMA) with + * @old and @new values. @decay represents how large the @old value remains. 
+ * With a larger @decay value, the moving average changes slowly, exhibiting + * fewer fluctuations. + */ +#define __calc_avg(old, new, decay) ({ \ + typeof(decay) thr = 1 << (decay); \ + typeof(old) ret; \ + if (((old) < thr) || ((new) < thr)) { \ + if (((old) == 1) && ((new) == 0)) \ + ret = 0; \ + else \ + ret = ((old) - ((old) >> 1)) + ((new) >> 1); \ + } else { \ + ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \ + } \ + ret; \ +}) + +/* * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. * @v: The value for which we're computing the base 2 logarithm. */ @@ -663,6 +730,25 @@ static inline u32 log2_u64(u64 v) } /* + * sqrt_u64 - Calculate the square root of value @x using Newton's method. + */ +static inline u64 __sqrt_u64(u64 x) +{ + if (x == 0 || x == 1) + return x; + + u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32); + + for (int i = 0; i < 8; ++i) { + u64 q = x / r; + if (r <= q) + break; + r = (r + q) >> 1; + } + return r; +} + +/* * Return a value proportionally scaled to the task's weight. */ static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value) diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index 1dc76bd84296..b3c6372bcf81 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -75,8 +75,9 @@ typedef int64_t s64; #include "enums.h" /* not available when building kernel tools/sched_ext */ -#if __has_include(<lib/sdt_task.h>) -#include <lib/sdt_task.h> +#if __has_include(<lib/sdt_task_defs.h>) +#include "bpf_arena_common.h" +#include <lib/sdt_task_defs.h> #endif #endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 9252e1a00556..dd9144624dc9 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ @@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) +#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ + (bpf_ksym_exists(bpf_cpumask_populate) ? \ + (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) + #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") @@ -226,6 +231,23 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) scx_bpf_pick_any_cpu(cpus_allowed, flags)) /* + * v6.18: Add a helper to retrieve the current task running on a CPU. + * + * Keep this helper available until v6.20 for compatibility. 
+ */ +static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu) +{ + struct rq *rq; + + if (bpf_ksym_exists(scx_bpf_cpu_curr)) + return scx_bpf_cpu_curr(cpu); + + rq = scx_bpf_cpu_rq(cpu); + + return rq ? rq->curr : NULL; +} + +/* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). */ diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h new file mode 100644 index 000000000000..e7ac6611a990 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ + +#ifndef __USER_EXIT_INFO_BPF_H +#define __USER_EXIT_INFO_BPF_H + +#ifndef LSP +#include "vmlinux.h" +#endif +#include <bpf/bpf_core_read.h> + +#include "user_exit_info_common.h" + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#endif /* __USER_EXIT_INFO_BPF_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 66f856640ee7..399697fa372f 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -10,55 +10,11 @@ #ifndef __USER_EXIT_INFO_H #define __USER_EXIT_INFO_H -#ifdef LSP -#define __bpf__ -#include "../vmlinux.h" -#endif - -enum uei_sizes { - UEI_REASON_LEN = 128, - UEI_MSG_LEN = 1024, - UEI_DUMP_DFL_LEN = 32768, -}; - -struct user_exit_info { - int kind; - s64 exit_code; - char reason[UEI_REASON_LEN]; - char msg[UEI_MSG_LEN]; -}; - -#ifdef __bpf__ - -#ifndef LSP -#include "vmlinux.h" -#endif -#include <bpf/bpf_core_read.h> - -#define UEI_DEFINE(__name) \ - char RESIZABLE_ARRAY(data, __name##_dump); \ - const volatile u32 __name##_dump_len; \ - struct user_exit_info __name SEC(".data") - -#define UEI_RECORD(__uei_name, __ei) ({ \ - bpf_probe_read_kernel_str(__uei_name.reason, \ - sizeof(__uei_name.reason), (__ei)->reason); \ - bpf_probe_read_kernel_str(__uei_name.msg, \ - sizeof(__uei_name.msg), (__ei)->msg); \ - bpf_probe_read_kernel_str(__uei_name##_dump, \ - __uei_name##_dump_len, (__ei)->dump); \ - if (bpf_core_field_exists((__ei)->exit_code)) \ - __uei_name.exit_code = (__ei)->exit_code; \ - /* use __sync to force memory barrier */ \ - __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ - (__ei)->kind); \ -}) - -#else /* !__bpf__ */ - #include <stdio.h> #include <stdbool.h> +#include "user_exit_info_common.h" + /* no need to call the following explicitly if SCX_OPS_LOAD() is used */ #define UEI_SET_SIZE(__skel, __ops_name, 
__uei_name) ({ \ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ @@ -114,5 +70,4 @@ enum uei_ecode_mask { #define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -#endif /* __bpf__ */ #endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h new file mode 100644 index 000000000000..2d0981aedd89 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info_common.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef __USER_EXIT_INFO_COMMON_H +#define __USER_EXIT_INFO_COMMON_H + +#ifdef LSP +#include "../vmlinux.h" +#endif + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#endif /* __USER_EXIT_INFO_COMMON_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 50bc1737c167..55df8b798865 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * A central FIFO sched_ext scheduler which demonstrates the followings: + * A central FIFO sched_ext scheduler which demonstrates the following: * * a. Making all scheduling decisions from one CPU: * diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 6ba6e610eeaa..55931a4cd71c 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -61,6 +61,7 @@ restart: skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + assert(skel->rodata->nr_cpu_ids > 0); assert(skel->rodata->nr_cpu_ids <= INT32_MAX); while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index fdc7170639e6..2c720e3ecad5 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops, .cgroup_move = (void *)fcg_cgroup_move, .init = (void *)fcg_init, .exit = (void *)fcg_exit, - .flags = SCX_OPS_ENQ_EXITING, + .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, .name = "flatcg"); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 6dd423eeb4ff..cd85eb401179 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -6,6 +6,7 @@ */ #include <stdio.h> #include <signal.h> +#include <assert.h> #include <unistd.h> #include <libgen.h> #include <limits.h> @@ -137,6 +138,7 @@ restart: skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->nr_cpus > 0); skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 69d877501cb7..3072b593f898 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -39,7 +39,8 @@ const volatile u32 
stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; const volatile bool highpri_boosting; -const volatile bool print_shared_dsq; +const volatile bool print_dsqs_and_events; +const volatile bool print_msgs; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -56,7 +57,8 @@ struct qmap { queue1 SEC(".maps"), queue2 SEC(".maps"), queue3 SEC(".maps"), - queue4 SEC(".maps"); + queue4 SEC(".maps"), + dump_store SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); @@ -578,11 +580,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) return; scx_bpf_dump("QMAP FIFO[%d]:", i); + + /* + * Dump can be invoked anytime and there is no way to iterate in + * a non-destructive way. Pop and store in dump_store and then + * restore afterwards. If racing against new enqueues, ordering + * can get mixed up. + */ bpf_repeat(4096) { if (bpf_map_pop_elem(fifo, &pid)) break; + bpf_map_push_elem(&dump_store, &pid, 0); scx_bpf_dump(" %d", pid); } + + bpf_repeat(4096) { + if (bpf_map_pop_elem(&dump_store, &pid)) + break; + bpf_map_push_elem(fifo, &pid, 0); + } + scx_bpf_dump("\n"); } } @@ -617,22 +634,25 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) { - bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", - cgrp->kn->id, args->weight, args->bw_period_us, - args->bw_quota_us, args->bw_burst_us); + if (print_msgs) + bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", + cgrp->kn->id, args->weight, args->bw_period_us, + args->bw_quota_us, args->bw_burst_us); return 0; } void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) { - bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); + if (print_msgs) + bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); } void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) { - bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id, - period_us, quota_us, burst_us); + if (print_msgs) + bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", + cgrp->kn->id, period_us, quota_us, burst_us); } /* @@ -676,16 +696,20 @@ static void print_cpus(void) void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) { - bpf_printk("CPU %d coming online", cpu); - /* @cpu is already online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); + } } void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) { - bpf_printk("CPU %d going offline", cpu); - /* @cpu is still online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); + } } struct monitor_timer { @@ -783,35 +807,36 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { - struct scx_event_stats events; - bpf_rcu_read_lock(); dispatch_highpri(true); bpf_rcu_read_unlock(); monitor_cpuperf(); - if (print_shared_dsq) + if (print_dsqs_and_events) { + struct scx_event_stats events; + dump_shared_dsq(); - __COMPAT_scx_bpf_events(&events, sizeof(events)); - - bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", - scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", - scx_read_event(&events, 
SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", - scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); - bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", - scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); - bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", - scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", - scx_read_event(&events, SCX_EV_BYPASS_DURATION)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", - scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", - scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + __COMPAT_scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", + scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + } bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; @@ -823,7 +848,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) struct bpf_timer *timer; s32 ret; - print_cpus(); + if (print_msgs) + print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); if (ret) diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index c4912ab2e76f..ef701d45ba43 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-P] [-d PID] [-D LEN] [-p] [-v]\n" +" [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,7 +28,8 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" -" -P Print out DSQ content to trace_pipe every second, use with -b\n" +" -P Print out DSQ content and event counters to trace_pipe every second\n" +" -M Print out debug messages to trace_pipe\n" " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" @@ -66,7 +67,7 @@ int main(int argc, char **argv) skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -87,7 +88,10 @@ int main(int argc, char **argv) skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; case 'P': - skel->rodata->print_shared_dsq = true; + 
skel->rodata->print_dsqs_and_events = true; + break; + case 'M': + skel->rodata->print_msgs = true; break; case 'H': skel->rodata->highpri_boosting = true; diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 76d83199545c..06d4b13bf76b 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -7,6 +7,7 @@ #include <stdio.h> #include <unistd.h> #include <signal.h> +#include <assert.h> #include <libgen.h> #include <bpf/bpf.h> #include <scx/common.h> @@ -41,6 +42,7 @@ static void sigint_handler(int simple) static void read_stats(struct scx_simple *skel, __u64 *stats) { int nr_cpus = libbpf_num_possible_cpus(); + assert(nr_cpus > 0); __u64 cnts[2][nr_cpus]; __u32 idx; diff --git a/tools/scripts/syscall.tbl b/tools/scripts/syscall.tbl index 580b4e246aec..d1ae5e92c615 100644 --- a/tools/scripts/syscall.tbl +++ b/tools/scripts/syscall.tbl @@ -408,3 +408,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 002ec38a8bbb..3b96d090c5eb 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -17,6 +17,8 @@ #include <asm/sigcontext.h> #include <asm/unistd.h> +#include <linux/auxvec.h> + #include "../../kselftest.h" #define TESTS_PER_HWCAP 3 @@ -55,7 +57,6 @@ static void cmpbr_sigill(void) /* Not implemented, too complicated and unreliable anyway */ } - static void crc32_sigill(void) { /* CRC32W W0, W0, W1 */ @@ -169,6 +170,18 @@ static void lse128_sigill(void) : "cc", "memory"); } +static void lsfe_sigill(void) +{ + float __attribute__ ((aligned (16))) mem; + register float *memp asm ("x0") = &mem; + + /* STFADD H0, [X0] */ + asm volatile(".inst 0x7c20801f" + : "+r" (memp) + : + : "memory"); +} + static void lut_sigill(void) { /* LUTI2 V0.16B, { V0.16B }, V[0] */ @@ -763,6 +776,13 @@ static const struct hwcap_data { .sigill_fn = lse128_sigill, }, { + .name = "LSFE", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_LSFE, + .cpuinfo = "lsfe", + .sigill_fn = lsfe_sigill, + }, + { .name = "LUT", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_LUT, diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index f58a9f89b952..4c89ab0f1010 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -227,10 +227,10 @@ int main(int argc, char **argv) ret = open("/proc/sys/abi/sme_default_vector_length", O_RDONLY, 0); if (ret >= 0) { ksft_test_result(default_value(), "default_value\n"); - ksft_test_result(write_read, "write_read\n"); - ksft_test_result(write_sleep_read, "write_sleep_read\n"); - ksft_test_result(write_fork_read, "write_fork_read\n"); - ksft_test_result(write_clone_read, "write_clone_read\n"); + ksft_test_result(write_read(), "write_read\n"); + ksft_test_result(write_sleep_read(), "write_sleep_read\n"); + ksft_test_result(write_fork_read(), "write_fork_read\n"); + ksft_test_result(write_clone_read(), "write_clone_read\n"); } else { ksft_print_msg("SME support not present\n"); diff --git a/tools/testing/selftests/arm64/bti/assembler.h b/tools/testing/selftests/arm64/bti/assembler.h index 04e7b72880ef..141cdcbf0b8f 100644 --- a/tools/testing/selftests/arm64/bti/assembler.h +++ b/tools/testing/selftests/arm64/bti/assembler.h @@ -14,7 +14,6 @@ #define 
GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) - .macro startfn name:req .globl \name \name: diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index 124bc883365e..a85c19e9524e 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1187,7 +1187,7 @@ static void sve_write_sve(pid_t child, struct test_config *config) if (!vl) return; - iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); + iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_SVE); iov.iov_base = malloc(iov.iov_len); if (!iov.iov_base) { ksft_print_msg("Failed allocating %lu byte SVE write buffer\n", @@ -1234,8 +1234,7 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config) if (!vl) return; - iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, - SVE_PT_REGS_FPSIMD); + iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD); iov.iov_base = malloc(iov.iov_len); if (!iov.iov_base) { ksft_print_msg("Failed allocating %lu byte SVE write buffer\n", @@ -1569,7 +1568,6 @@ static void run_sve_tests(void) &test_config); } } - } static void run_sme_tests(void) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 74e23208b94c..9349aa630c84 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -105,8 +105,8 @@ static void child_start(struct child_data *child, const char *program) /* * Read from the startup pipe, there should be no data - * and we should block until it is closed. We just - * carry on on error since this isn't super critical. + * and we should block until it is closed. We just + * carry-on on error since this isn't super critical. 
*/ ret = read(3, &i, sizeof(i)); if (ret < 0) @@ -549,7 +549,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < cpus; i++) { diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index e3cec3723ffa..0c40007d1282 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -188,13 +188,13 @@ static bool create_socket(void) ref = malloc(digest_len); if (!ref) { - printf("Failed to allocated %d byte reference\n", digest_len); + printf("Failed to allocate %d byte reference\n", digest_len); return false; } digest = malloc(digest_len); if (!digest) { - printf("Failed to allocated %d byte digest\n", digest_len); + printf("Failed to allocate %d byte digest\n", digest_len); return false; } diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index b22303778fb0..e0fc3a001e28 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -66,7 +66,7 @@ static const struct vec_type vec_types[] = { }; #define VL_TESTS (((TEST_VQ_MAX - SVE_VQ_MIN) + 1) * 4) -#define FLAG_TESTS 2 +#define FLAG_TESTS 4 #define FPSIMD_TESTS 2 #define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types)) @@ -95,19 +95,27 @@ static int do_child(void) static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_GETREGSET)"); + return ret; } static int set_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; } static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, @@ -115,8 +123,9 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, { struct user_sve_header *sve; void *p; - size_t sz = sizeof *sve; + size_t sz = sizeof(*sve); struct iovec iov; + int ret; while (1) { if (*size < sz) { @@ -132,8 +141,11 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, iov.iov_base = *buf; iov.iov_len = sz; - if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov)) + ret = ptrace(PTRACE_GETREGSET, pid, type->regset, &iov); + if (ret) { + ksft_perror("ptrace(PTRACE_GETREGSET)"); goto error; + } sve = *buf; if (sve->size <= sz) @@ -152,10 +164,46 @@ static int set_sve(pid_t pid, const struct vec_type *type, const struct user_sve_header *sve) { struct iovec iov; + int ret; iov.iov_base = (void *)sve; iov.iov_len = sve->size; - return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; +} + +/* A read operation fails */ +static void read_fails(pid_t child, const struct vec_type *type) +{ + struct user_sve_header *new_sve = NULL; + size_t new_sve_size = 0; + void *ret; + + ret = get_sve(child, type, (void 
**)&new_sve, &new_sve_size); + + ksft_test_result(ret == NULL, "%s unsupported read fails\n", + type->name); + + free(new_sve); +} + +/* A write operation fails */ +static void write_fails(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + /* Just the header, no data */ + memset(&sve, 0, sizeof(sve)); + sve.size = sizeof(sve); + sve.flags = SVE_PT_REGS_SVE; + sve.vl = SVE_VL_MIN; + ret = set_sve(child, type, &sve); + + ksft_test_result(ret != 0, "%s unsupported write fails\n", + type->name); } /* Validate setting and getting the inherit flag */ @@ -270,6 +318,25 @@ static void check_u32(unsigned int vl, const char *reg, } } +/* Set out of range VLs */ +static void ptrace_set_vl_ranges(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + memset(&sve, 0, sizeof(sve)); + sve.flags = SVE_PT_REGS_SVE; + sve.size = sizeof(sve); + + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL 0\n", type->name); + + sve.vl = SVE_VL_MAX + SVE_VQ_BYTES; + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL %d\n", type->name, + SVE_VL_MAX + SVE_VQ_BYTES); +} + /* Access the FPSIMD registers via the SVE regset */ static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type) { @@ -683,6 +750,20 @@ static int do_parent(pid_t child) } for (i = 0; i < ARRAY_SIZE(vec_types); i++) { + /* + * If the vector type isn't supported reads and writes + * should fail. + */ + if (!(getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap)) { + read_fails(child, &vec_types[i]); + write_fails(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s unsupported read fails\n", + vec_types[i].name); + ksft_test_result_skip("%s unsupported write fails\n", + vec_types[i].name); + } + /* FPSIMD via SVE regset */ if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { ptrace_sve_fpsimd(child, &vec_types[i]); @@ -703,6 +784,17 @@ static int do_parent(pid_t child) vec_types[i].name); } + /* Setting out of bounds VLs should fail */ + if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { + ptrace_set_vl_ranges(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s Set invalid VL 0\n", + vec_types[i].name); + ksft_test_result_skip("%s Set invalid VL %d\n", + vec_types[i].name, + SVE_VL_MAX + SVE_VQ_BYTES); + } + /* Step through every possible VQ */ for (vq = SVE_VQ_MIN; vq <= TEST_VQ_MAX; vq++) { vl = sve_vl_from_vq(vq); diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c index ea9c7d47790f..2d75d342eeb9 100644 --- a/tools/testing/selftests/arm64/fp/vec-syscfg.c +++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c @@ -690,7 +690,6 @@ static inline void smstop(void) asm volatile("msr S0_3_C4_C6_3, xzr"); } - /* * Verify we can change the SVE vector length while SME is active and * continue to use SME afterwards. 
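For reference, the new negative tests above (write_fails() and ptrace_set_vl_ranges()) all reduce to a header-only NT_ARM_SVE regset write that the kernel is expected to reject. A rough standalone sketch of that pattern follows, assuming an arm64 host and a ptrace-stopped child pid; the helper name is made up and the real checks live in set_sve()/ptrace_set_vl_ranges() above.

#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <linux/elf.h>       /* NT_ARM_SVE */
#include <asm/sigcontext.h>  /* SVE_VL_MAX, SVE_VQ_BYTES */
#include <asm/ptrace.h>      /* struct user_sve_header, SVE_PT_REGS_SVE */

/* Ask the kernel to set an out-of-range vector length; this must fail. */
static int check_invalid_vl_rejected(pid_t pid, unsigned int vl)
{
	struct user_sve_header sve;
	struct iovec iov = { .iov_base = &sve, .iov_len = sizeof(sve) };

	memset(&sve, 0, sizeof(sve));
	sve.size = sizeof(sve);		/* header only, no register payload */
	sve.flags = SVE_PT_REGS_SVE;
	sve.vl = vl;			/* e.g. 0 or SVE_VL_MAX + SVE_VQ_BYTES */

	errno = 0;
	if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_SVE, &iov) != 0)
		return 0;		/* rejected as expected (typically EINVAL) */
	return -1;			/* unexpectedly accepted */
}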
diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c index 584b8d59b7ea..a7f34040fbf1 100644 --- a/tools/testing/selftests/arm64/fp/zt-ptrace.c +++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c @@ -108,7 +108,6 @@ static int get_zt(pid_t pid, char zt[ZT_SIG_REG_BYTES]) return ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZT, &iov); } - static int set_zt(pid_t pid, const char zt[ZT_SIG_REG_BYTES]) { struct iovec iov; diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index d2f3497a9103..1fbbf0ca1f02 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -14,11 +14,11 @@ LDLIBS+=-lpthread include ../../lib.mk $(OUTPUT)/basic-gcs: basic-gcs.c - $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ - -static -include ../../../../include/nolibc/nolibc.h \ + $(CC) $(CFLAGS) -fno-asynchronous-unwind-tables -fno-ident -s -nostdlib -nostdinc \ + -static -I../../../../include/nolibc -include ../../../../include/nolibc/nolibc.h \ -I../../../../../usr/include \ -std=gnu99 -I../.. -g \ - -ffreestanding -Wall $^ -o $@ -lgcc + -ffreestanding $^ -o $@ -lgcc $(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S $(CC) -nostdlib $^ -o $@ diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 54f9c888249d..250977abc398 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -10,6 +10,7 @@ #include <sys/mman.h> #include <asm/mman.h> +#include <asm/hwcap.h> #include <linux/sched.h> #include "kselftest.h" @@ -386,14 +387,13 @@ int main(void) ksft_print_header(); - /* - * We don't have getauxval() with nolibc so treat a failure to - * read GCS state as a lack of support and skip. 
- */ + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_skip("Failed to read GCS state: %d\n", ret); + ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret); if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { gcs_mode = PR_SHADOW_STACK_ENABLE; @@ -410,7 +410,7 @@ int main(void) } /* One last test: disable GCS, we can do this one time */ - my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); if (ret != 0) ksft_print_msg("Failed to disable GCS: %d\n", ret); diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c index 989f75a491b7..1e6abb136ffd 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-locking.c +++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c @@ -165,7 +165,6 @@ TEST_F(valid_modes, lock_enable_disable_others) ASSERT_EQ(ret, 0); ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES); - ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, variant->mode); ASSERT_EQ(ret, 0); diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index bbc7f4950c13..cf316d78ea97 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -433,7 +433,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < gcs_threads; i++) diff --git a/tools/testing/selftests/arm64/pauth/exec_target.c b/tools/testing/selftests/arm64/pauth/exec_target.c index 4435600ca400..e597861b26d6 100644 --- a/tools/testing/selftests/arm64/pauth/exec_target.c +++ b/tools/testing/selftests/arm64/pauth/exec_target.c @@ -13,7 +13,12 @@ int main(void) unsigned long hwcaps; size_t val; - fread(&val, sizeof(size_t), 1, stdin); + size_t size = fread(&val, sizeof(size_t), 1, stdin); + + if (size != 1) { + fprintf(stderr, "Could not read input from stdin\n"); + return EXIT_FAILURE; + } /* don't try to execute illegal (unimplemented) instructions) caller * should have checked this and keep worker simple diff --git a/tools/testing/selftests/bpf/prog_tests/free_timer.c b/tools/testing/selftests/bpf/prog_tests/free_timer.c index b7b77a6b2979..0de8facca4c5 100644 --- a/tools/testing/selftests/bpf/prog_tests/free_timer.c +++ b/tools/testing/selftests/bpf/prog_tests/free_timer.c @@ -124,6 +124,10 @@ void test_free_timer(void) int err; skel = free_timer__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "open_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index d66687f1ee6a..56f660ca567b 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -86,6 +86,10 @@ void serial_test_timer(void) int err; timer_skel = timer__open_and_load(); + if (!timer_skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/timer_crash.c b/tools/testing/selftests/bpf/prog_tests/timer_crash.c index f74b82305da8..b841597c8a3a 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_crash.c +++ 
b/tools/testing/selftests/bpf/prog_tests/timer_crash.c @@ -12,6 +12,10 @@ static void test_timer_crash_mode(int mode) struct timer_crash *skel; skel = timer_crash__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_crash__open_and_load")) return; skel->bss->pid = getpid(); diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c index 1a2f99596916..eb303fa1e09a 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c @@ -59,6 +59,10 @@ void test_timer_lockup(void) } skel = timer_lockup__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c index 9ff7843909e7..c930c7d7105b 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_mim.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c @@ -65,6 +65,10 @@ void serial_test_timer_mim(void) goto cleanup; timer_skel = timer_mim__open_and_load(); + if (!timer_skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index b17dc39a23db..6d75ede16e7c 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -8,22 +8,31 @@ #include <asm/ptrace.h> #include <linux/compiler.h> #include <linux/stringify.h> +#include <linux/kernel.h> #include <sys/wait.h> #include <sys/syscall.h> #include <sys/prctl.h> #include <asm/prctl.h> #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "bpf/libbpf_internal.h" -__naked unsigned long uretprobe_regs_trigger(void) +#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 +#include "usdt.h" + +#pragma GCC diagnostic ignored "-Wattributes" + +__attribute__((aligned(16))) +__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void) { asm volatile ( + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */ "movq $0xdeadbeef, %rax\n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -44,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) "movq $0, 120(%rdi)\n" /* orig_rax */ "movq $0, 128(%rdi)\n" /* rip */ "movq $0, 136(%rdi)\n" /* cs */ + "pushq %rax\n" "pushf\n" "pop %rax\n" "movq %rax, 144(%rdi)\n" /* eflags */ + "pop %rax\n" "movq %rsp, 152(%rdi)\n" /* rsp */ "movq $0, 160(%rdi)\n" /* ss */ /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -92,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned 
long offset; unsigned int i, cnt; - int err; + + offset = get_uprobe_offset(&uprobe_regs_trigger); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -119,7 +142,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. */ switch (offset) { @@ -133,7 +156,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call (with rax exception, check below). */ switch (offset) { /* @@ -146,6 +169,15 @@ static void test_uretprobe_regs_equal(void) case offsetof(struct pt_regs, rsp): case offsetof(struct pt_regs, ss): break; + /* + * uprobe does not see return value in rax, it needs to see the + * original (before) rax value + */ + case offsetof(struct pt_regs, rax): + if (!retprobe) { + ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check"); + break; + } default: if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) fprintf(stdout, "failed register offset %u\n", offset); @@ -175,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? 
(int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -183,13 +215,16 @@ static void test_uretprobe_regs_change(void) unsigned long cnt = sizeof(before)/sizeof(*pb); unsigned int i, err, offset; - offset = get_uprobe_offset(uretprobe_regs_trigger); + offset = get_uprobe_offset(uprobe_regs_trigger); err = write_bpf_testmod_uprobe(offset); if (!ASSERT_OK(err, "register_uprobe")) return; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); if (!ASSERT_OK(err, "unregister_uprobe")) @@ -252,6 +287,7 @@ static void test_uretprobe_syscall_call(void) ); struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c = 0; + struct bpf_link *link; if (!ASSERT_OK(pipe(go), "pipe")) return; @@ -277,11 +313,14 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->bss->pid = pid; + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) goto cleanup; + skel->links.test_uretprobe_multi = link; /* kick the child */ write(go[1], &c, 1); @@ -301,6 +340,256 @@ cleanup: close(go[0]); } +#define TRAMP "[uprobes-trampoline]" + +__attribute__((aligned(16))) +__nocf_check __weak __naked void uprobe_test(void) +{ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} + +__attribute__((aligned(16))) +__nocf_check __weak void usdt_test(void) +{ + USDT(optimized_uprobe, usdt); +} + +static int find_uprobes_trampoline(void *tramp_addr) +{ + void *start, *end; + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. */ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; +} + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +typedef void (__attribute__((nocf_check)) *trigger_t)(void); + +static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, + void *addr, int executed) +{ + struct __arch_relative_insn { + __u8 op; + __s32 raddr; + } __packed *call; + void *tramp = NULL; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, executed, "executed"); + + /* .. and check the trampoline is as expected. 
*/ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + + return tramp; +} + +static void check_detach(void *addr, void *tramp) +{ + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + trigger_t trigger, void *addr, int executed) +{ + void *tramp; + + tramp = check_attach(skel, trigger, addr, executed); + bpf_link__destroy(link); + check_detach(addr, tramp); +} + +static void test_uprobe_legacy(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + /* uprobe.multi */ + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_session(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .session = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, + 0, "/proc/self/exe", NULL, 
&opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 4); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + /* * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c. * @@ -343,43 +632,172 @@ static void test_uretprobe_shadow_stack(void) return; } - /* Run all of the uretprobe tests. */ - test_uretprobe_regs_equal(); - test_uretprobe_regs_change(); + /* Run all the tests with shadow stack in place. */ + + test_uprobe_regs_equal(false); + test_uprobe_regs_equal(true); test_uretprobe_syscall_call(); + test_uprobe_legacy(); + test_uprobe_multi(); + test_uprobe_session(); + test_uprobe_usdt(); + + test_regs_change(); + ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) + +static volatile bool race_stop; + +static USDT_DEFINE_SEMA(race); + +static void *worker_trigger(void *arg) { - test__skip(); + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; } -static void test_uretprobe_regs_change(void) +static void *worker_attach(void *arg) { - test__skip(); + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + const char *sema[2] = { + __stringify(USDT_SEMA(race)), + NULL, + }; + unsigned long *ref; + int err; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return NULL; + + err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + return NULL; + + opts.ref_ctr_offset = *ref; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + skel->bss->pid = getpid(); + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + free(ref); + return NULL; } -static void test_uretprobe_syscall_call(void) +static useconds_t race_msec(void) { - test__skip(); + char *env; + + env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC"); + if (env) + return atoi(env); + + /* default duration is 500ms */ + return 500; } -static void test_uretprobe_shadow_stack(void) +static void test_uprobe_race(void) { - test__skip(); + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = 
libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + nr_threads = max(2, nr_threads); + + threads = alloca(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + usleep(race_msec() * 1000); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); } + +#ifndef __NR_uprobe +#define __NR_uprobe 336 #endif -void test_uprobe_syscall(void) +static void test_uprobe_error(void) +{ + long err = syscall(__NR_uprobe); + + ASSERT_EQ(err, -1, "error"); + ASSERT_EQ(errno, ENXIO, "errno"); +} + +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) - test_uretprobe_regs_equal(); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); + test_uprobe_regs_equal(true); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_session")) + test_uprobe_session(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); + if (test__start_subtest("uprobe_error")) + test_uprobe_error(); + if (test__start_subtest("uprobe_regs_equal")) + test_uprobe_regs_equal(false); + if (test__start_subtest("regs_change")) + test_regs_change(); +} +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); } diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 9057e983cc54..833eb87483a1 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) { } } -static void subtest_basic_usdt(void) +static void subtest_basic_usdt(bool optimized) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; - int err, i; + int err, i, called; + +#define TRIGGER(x) ({ \ + trigger_func(x); \ + if (optimized) \ + trigger_func(x); \ + optimized ? 2 : 1; \ + }) skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -66,11 +73,11 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; - trigger_func(1); + called = TRIGGER(1); - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); @@ -119,11 +126,11 @@ static void subtest_basic_usdt(void) * bpf_program__attach_usdt() handles this properly and attaches to * all possible places of USDT invocation. 
*/ - trigger_func(2); + called += TRIGGER(2); - ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); /* only check values that depend on trigger_func()'s input value */ ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); @@ -142,9 +149,9 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) goto cleanup; - trigger_func(3); + called += TRIGGER(3); - ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); /* this time usdt3 has custom cookie */ ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); @@ -158,6 +165,7 @@ static void subtest_basic_usdt(void) cleanup: test_usdt__destroy(skel); +#undef TRIGGER } unsigned short test_usdt_100_semaphore SEC(".probes"); @@ -425,7 +433,11 @@ cleanup: void test_usdt(void) { if (test__start_subtest("basic")) - subtest_basic_usdt(); + subtest_basic_usdt(false); +#ifdef __x86_64__ + if (test__start_subtest("basic_optimized")) + subtest_basic_usdt(true); +#endif if (test__start_subtest("multispec")) subtest_multispec_usdt(); if (test__start_subtest("urand_auto_attach")) diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h index d67466c1ff77..f90531cf3ee5 100644 --- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h @@ -302,7 +302,7 @@ int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val * barriers. */ if (val & _Q_LOCKED_MASK) - smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); /* * take ownership and clear the pending bit. @@ -380,7 +380,7 @@ queue: /* Link @node into the waitqueue. 
*/ WRITE_ONCE(prev->next, node); - arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); /* * While waiting for the MCS lock, the next pointer may have diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c index 645be6cddf36..dfd8a258f14a 100644 --- a/tools/testing/selftests/bpf/progs/crypto_sanity.c +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -14,7 +14,7 @@ unsigned char key[256] = {}; u16 udp_test_port = 7777; u32 authsize, key_len; char algo[128] = {}; -char dst[16] = {}; +char dst[16] = {}, dst_bad[8] = {}; int status; static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) @@ -59,10 +59,9 @@ int skb_crypto_setup(void *ctx) .authsize = authsize, }; struct bpf_crypto_ctx *cctx; - int err = 0; + int err; status = 0; - if (key_len > 256) { status = -EINVAL; return 0; @@ -70,8 +69,8 @@ int skb_crypto_setup(void *ctx) __builtin_memcpy(¶ms.algo, algo, sizeof(algo)); __builtin_memcpy(¶ms.key, key, sizeof(key)); - cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); if (!cctx) { status = err; return 0; @@ -80,7 +79,6 @@ int skb_crypto_setup(void *ctx) err = crypto_ctx_insert(cctx); if (err && err != -EEXIST) status = err; - return 0; } @@ -92,6 +90,7 @@ int decrypt_sanity(struct __sk_buff *skb) struct bpf_dynptr psrc, pdst; int err; + status = 0; err = skb_dynptr_validate(skb, &psrc); if (err < 0) { status = err; @@ -110,13 +109,23 @@ int decrypt_sanity(struct __sk_buff *skb) return TC_ACT_SHOT; } - /* dst is a global variable to make testing part easier to check. In real - * production code, a percpu map should be used to store the result. + /* Check also bad case where the dst buffer is smaller than the + * skb's linear section. + */ + bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst); + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL); + if (!status) + status = -EIO; + if (status != -EINVAL) + goto err; + + /* dst is a global variable to make testing part easier to check. + * In real production code, a percpu map should be used to store + * the result. */ bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); - status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL); - +err: return TC_ACT_SHOT; } @@ -129,7 +138,6 @@ int encrypt_sanity(struct __sk_buff *skb) int err; status = 0; - err = skb_dynptr_validate(skb, &psrc); if (err < 0) { status = err; @@ -148,13 +156,23 @@ int encrypt_sanity(struct __sk_buff *skb) return TC_ACT_SHOT; } - /* dst is a global variable to make testing part easier to check. In real - * production code, a percpu map should be used to store the result. + /* Check also bad case where the dst buffer is smaller than the + * skb's linear section. + */ + bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst); + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL); + if (!status) + status = -EIO; + if (status != -EINVAL) + goto err; + + /* dst is a global variable to make testing part easier to check. + * In real production code, a percpu map should be used to store + * the result. 
*/ bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); - status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL); - +err: return TC_ACT_SHOT; } diff --git a/tools/testing/selftests/bpf/progs/linked_list_fail.c b/tools/testing/selftests/bpf/progs/linked_list_fail.c index 6438982b928b..ddd26d1a083f 100644 --- a/tools/testing/selftests/bpf/progs/linked_list_fail.c +++ b/tools/testing/selftests/bpf/progs/linked_list_fail.c @@ -226,8 +226,7 @@ int obj_new_no_composite(void *ctx) SEC("?tc") int obj_new_no_struct(void *ctx) { - - bpf_obj_new(union { int data; unsigned udata; }); + (void)bpf_obj_new(union { int data; unsigned udata; }); return 0; } @@ -252,7 +251,7 @@ int new_null_ret(void *ctx) SEC("?tc") int obj_new_acq(void *ctx) { - bpf_obj_new(struct foo); + (void)bpf_obj_new(struct foo); return 0; } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index 46697f381878..a47690174e0e 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -30,8 +30,12 @@ __test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); } __test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); } __test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); } __test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); } -__test(0) int test_strnstr_found(void *ctx) { return bpf_strnstr(str, "hello", 6); } -__test(-ENOENT) int test_strnstr_notfound(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); } +__test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); } +__test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); } +__test(-ENOENT) int test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); } +__test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); } __test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c index 8a4fa6c7ef59..e08c31669e5a 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c @@ -7,8 +7,8 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; -SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") -int uretprobe(struct pt_regs *ctx) +SEC("uprobe") +int probe(struct pt_regs *ctx) { __builtin_memcpy(®s, ctx, sizeof(regs)); return 0; diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2..915d38591bf6 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/usdt.bpf.h> #include <string.h> struct pt_regs regs; @@ -8,10 +10,64 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; int executed = 0; +int pid; + +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + 
return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.session") +int test_uprobe_session(struct pt_regs *ctx) { - executed = 1; + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; return 0; } diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e9e918cdf31f..511911053bdc 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -501,14 +501,20 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { #ifdef __x86_64__ static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + +static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -520,6 +526,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h new file mode 100644 index 000000000000..549d1f774810 --- /dev/null +++ b/tools/testing/selftests/bpf/usdt.h @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * This single-header library defines a collection of variadic macros for + * defining and triggering USDTs (User Statically-Defined Tracepoints): + * + * - For USDTs without associated semaphore: + * USDT(group, name, args...) + * + * - For USDTs with implicit (transparent to the user) semaphore: + * USDT_WITH_SEMA(group, name, args...) + * USDT_IS_ACTIVE(group, name) + * + * - For USDTs with explicit (user-defined and provided) semaphore: + * USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...) + * USDT_SEMA_IS_ACTIVE(sema) + * + * all of which emit a NOP instruction into the instruction stream, and so + * have *zero* overhead for the surrounding code. USDTs are identified by + * a combination of `group` and `name` identifiers, which is used by external + * tracing tooling (tracers) for identifying exact USDTs of interest. + * + * USDTs can have an associated (2-byte) activity counter (USDT semaphore), + * automatically maintained by Linux kernel whenever any correctly written + * BPF-based tracer is attached to the USDT. This USDT semaphore can be used + * to check whether there is a need to do any extra data collection and + * processing for a given USDT (if necessary), and otherwise avoid extra work + * for a common case of USDT not being traced ("active"). 
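/*
 * Illustrative only, not part of the patch: a minimal call site for the
 * semaphoreless USDT() form described above. The group, name and arguments
 * are made up for the example.
 */
#include "usdt.h"

static void handle_request(int req_id, long len)
{
	/* expands to a single NOP plus an ELF note recording req_id and len */
	USDT(myapp, request__start, req_id, len);
}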
+ * + * See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or + * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on + * working with USDTs with implicitly or explicitly associated + * USDT semaphores, respectively. + * + * There is also some additional data recorded into an auxiliary note + * section. The data in the note section describes the operands, in terms of + * size and location, used by tracing tooling to know where to find USDT + * arguments. Each location is encoded as an assembler operand string. + * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert + * breakpoints on top of the nop, and decode the location operand-strings, + * like an assembler, to find the values being passed. + * + * The operand strings are selected by the compiler for each operand. + * They are constrained by inline-assembler codes.The default is: + * + * #define USDT_ARG_CONSTRAINT nor + * + * This is a good default if the operands tend to be integral and + * moderate in number (smaller than number of registers). In other + * cases, the compiler may report "'asm' requires impossible reload" or + * similar. In this case, consider simplifying the macro call (fewer + * and simpler operands), reduce optimization, or override the default + * constraints string via: + * + * #define USDT_ARG_CONSTRAINT g + * #include <usdt.h> + * + * For some historical description of USDT v3 format (the one used by this + * library and generally recognized and assumed by BPF-based tracing tools) + * see [0]. The more formal specification can be found at [1]. Additional + * argument constraints information can be found at [2]. + * + * Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for + * this USDT library implementation. Current implementation differs *a lot* in + * terms of exposed user API and general usability, which was the main goal + * and focus of the reimplementation work. Nevertheless, underlying recorded + * USDT definitions are fully binary compatible and any USDT-based tooling + * should work equally well with USDTs defined by either SystemTap's or this + * library's USDT implementation. + * + * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html + * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html + * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h + */ +#ifndef __USDT_H +#define __USDT_H + +/* + * Changelog: + * + * 0.1.0 + * ----- + * - Initial release + */ +#define USDT_MAJOR_VERSION 0 +#define USDT_MINOR_VERSION 1 +#define USDT_PATCH_VERSION 0 + +/* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L) +#define __usdt_va_opt 1 +#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__ +#else +#define __usdt_va_args(...) , ##__VA_ARGS__ +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. No USDT semaphore is + * associated with this USDT. + * + * Such "semaphoreless" USDTs are commonly used when there is no extra data + * collection or processing needed to collect and prepare USDT arguments and + * they are just available in the surrounding code. 
USDT() macro will just + * record their locations in CPU registers or in memory for tracing tooling to + * be able to access them, if necessary. + */ +#ifdef __usdt_va_opt +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__) +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. USDT also get an + * implicitly-defined associated USDT semaphore, which will be "activated" by + * tracing tooling and can be used to check whether USDT is being actively + * observed. + * + * USDTs with semaphore are commonly used when there is a need to perform + * additional data collection and processing to prepare USDT arguments, which + * otherwise might not be necessary for the rest of application logic. In such + * case, USDT semaphore can be used to avoid unnecessary extra work. If USDT + * is not traced (which is presumed to be a common situation), the associated + * USDT semaphore is "inactive", and so there is no need to waste resources to + * prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to check whether + * USDT is "active". + * + * N.B. There is an inherent (albeit short) gap between checking whether USDT + * is active and triggering corresponding USDT, in which external tracer can + * be attached to an USDT and activate USDT semaphore after the activity check. + * If such a race occurs, tracers might miss one USDT execution. Tracers are + * expected to accommodate such possibility and this is expected to not be + * a problem for applications and tracers. + * + * N.B. Implicit USDT semaphore defined by USDT_WITH_SEMA() is contained + * within a single executable or shared library and is not shared outside + * them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name + * identifier across executable and shared library, it will work and won't + * conflict, per se, but will define independent USDT semaphores, one for each + * shared library/executable in which USDT_WITH_SEMA(group, name) is used. + * That is, if you attach to this USDT in one shared library (or executable), + * then only USDT semaphore within that shared library (or executable) will be + * updated by the kernel, while other libraries (or executable) will not see + * activated USDT semaphore. In short, it's best to use unique USDT group:name + * identifiers across different shared libraries (and, equivalently, between + * executable and shared library). This is advanced consideration and is + * rarely (if ever) seen in practice, but just to avoid surprises this is + * called out here. (Static libraries become a part of final executable, once + * linked by linker, so the above considerations don't apply to them.) + */ +#ifdef __usdt_va_opt +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name) \ + __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name), \ + ##__VA_ARGS__) +#endif + +struct usdt_sema { volatile unsigned short active; }; + +/* + * Check if USDT with `group`:`name` identifier is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). 
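As a rough illustration of the two triggering styles documented above, here is a minimal, hedged usage sketch; the myapp/request identifiers and the prepare_stats() helper are hypothetical and not part of this header:

	#include "usdt.h"

	/* hypothetical, potentially expensive helper -- only worth running when traced */
	static long prepare_stats(void)
	{
		return 42;
	}

	void handle_request(int req_id)
	{
		/* semaphoreless USDT: arguments are already available in surrounding code */
		USDT(myapp, request_start, req_id);

		/* implicit-semaphore USDT: skip the extra work unless a tracer is attached */
		if (USDT_IS_ACTIVE(myapp, request_stats))
			USDT_WITH_SEMA(myapp, request_stats, req_id, prepare_stats());
	}

A BPF-based tracer (for example the SEC("usdt") selftest program added earlier in this diff) attaching to myapp:request_stats would activate the implicit semaphore, at which point USDT_IS_ACTIVE() starts returning true.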
+ * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into the given USDT. It is assumed that USDT is triggered with + * USDT_WITH_SEMA() macro which will implicitly define associated USDT + * semaphore. (If one needs more control over USDT semaphore, see + * USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.) + * + * N.B. Such checks are necessarily racy and speculative. Between checking + * whether USDT is active and triggering the USDT itself, tracer can be + * detached with no notification. This race should be extremely rare and worst + * case should result in one-time wasted extra data collection and processing. + */ +#define USDT_IS_ACTIVE(group, name) ({ \ + extern struct usdt_sema __usdt_sema_name(group, name) \ + __usdt_asm_name(__usdt_sema_name(group, name)); \ + __usdt_sema_implicit(__usdt_sema_name(group, name)); \ + __usdt_sema_name(group, name).active > 0; \ +}) + +/* + * APIs for working with user-defined explicit USDT semaphores. + * + * This is a less commonly used advanced API for use cases in which user needs + * an explicit control over (potentially shared across multiple USDTs) USDT + * semaphore instance. This can be used when there is a group of logically + * related USDTs that all need extra data collection and processing whenever + * any of a family of related USDTs are "activated" (i.e., traced). In such + * a case, all such related USDTs will be associated with the same shared USDT + * semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be + * triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra + * USDT semaphore identifier as an extra parameter. + */ + +/** + * Underlying C global variable name for user-defined USDT semaphore with + * `sema` identifier. Could be useful for debugging, but normally shouldn't be + * used explicitly. + */ +#define USDT_SEMA(sema) __usdt_sema_##sema + +/* + * Define storage for user-defined USDT semaphore `sema`. + * + * Should be used only once in non-header source file to let compiler allocate + * space for the semaphore variable. Just like with any other global variable. + * + * This macro can be used anywhere where global variable declaration is + * allowed. Just like with global variable definitions, there should be only + * one definition of user-defined USDT semaphore with given `sema` identifier, + * otherwise compiler or linker will complain about duplicate variable + * definition. + * + * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace + * and inside namespaces (including nested namespaces). Just make sure that + * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is + * referenced, or any of its parent namespaces, so the C++ language-level + * identifier is visible to the code that needs to reference the semaphore. + * At the lowest layer, USDT semaphores have global naming and visibility + * (they have a corresponding `__usdt_sema_<name>` symbol, which can be linked + * against from C or C++ code, if necessary). To keep it simple, putting + * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest + * no-brainer solution. All these aspects are irrelevant for plain C, because + * C doesn't have namespaces and everything is always in the global namespace. + * + * N.B. 
Due to USDT metadata being recorded in non-allocatable ELF note + * section, it has limitations when it comes to relocations, which, in + * practice, means that it's not possible to correctly share USDT semaphores + * between main executable and shared libraries, or even between multiple + * shared libraries. USDT semaphore has to be contained to individual shared + * library or executable to avoid unpleasant surprises with half-working USDT + * semaphores. We enforce this by marking semaphore ELF symbols as having + * a hidden visibility. This is quite an advanced use case and consideration + * and for most users this should have no consequences whatsoever. + */ +#define USDT_DEFINE_SEMA(sema) \ + struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \ + __usdt_asm_name(USDT_SEMA(sema)) \ + __attribute__((visibility("hidden"))) = { 0 } + +/* + * Declare extern reference to user-defined USDT semaphore `sema`. + * + * Refers to a variable defined in another compilation unit by + * USDT_DEFINE_SEMA() and allows to use the same USDT semaphore across + * multiple compilation units (i.e., .c and .cpp files). + * + * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities. + */ +#define USDT_DECLARE_SEMA(sema) \ + extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema)) + +/* + * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). + * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into USDT(s) associated with USDT semaphore `sema`. + * + * N.B. Such checks are necessarily racy. Between checking the state of USDT + * semaphore and triggering associated USDT(s), the active tracer might attach + * or detach. This race should be extremely rare and worst case should result + * in one-time missed USDT event or wasted extra data collection and + * processing. USDT-using tracers should be written with this in mind and is + * not a concern of the application defining USDTs with associated semaphore. + */ +#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0) + +/* + * Invoke USDT specified by `group` and `name` identifiers and associate + * explicitly user-defined semaphore `sema` with it. Pass through `args` as + * USDT arguments. `args` are optional and zero arguments are acceptable. + * + * Semaphore is defined with the help of USDT_DEFINE_SEMA() macro and can be + * checked whether active with USDT_SEMA_IS_ACTIVE(). + */ +#ifdef __usdt_va_opt +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__) +#else +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) 
\ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__) +#endif + +/* + * Adjustable implementation aspects + */ +#ifndef USDT_ARG_CONSTRAINT +#if defined __powerpc__ +#define USDT_ARG_CONSTRAINT nZr +#elif defined __arm__ +#define USDT_ARG_CONSTRAINT g +#elif defined __loongarch__ +#define USDT_ARG_CONSTRAINT nmr +#else +#define USDT_ARG_CONSTRAINT nor +#endif +#endif /* USDT_ARG_CONSTRAINT */ + +#ifndef USDT_NOP +#if defined(__ia64__) || defined(__s390__) || defined(__s390x__) +#define USDT_NOP nop 0 +#else +#define USDT_NOP nop +#endif +#endif /* USDT_NOP */ + +/* + * Implementation details + */ +/* USDT name for implicitly-defined USDT semaphore, derived from group:name */ +#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name +/* ELF section into which USDT semaphores are put */ +#define __usdt_sema_sec __attribute__((section(".probes"))) + +#define __usdt_concat(a, b) a ## b +#define __usdt_apply(fn, n) __usdt_concat(fn, n) + +#ifndef __usdt_nth +#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N +#endif + +#ifndef __usdt_narg +#ifdef __usdt_va_opt +#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#else +#define __usdt_narg(...) __usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif +#endif /* __usdt_narg */ + +#define __usdt_hash # +#define __usdt_str_(x) #x +#define __usdt_str(x) __usdt_str_(x) + +#ifndef __usdt_asm_name +#define __usdt_asm_name(name) __asm__(__usdt_str(name)) +#endif + +#define __usdt_asm0() "\n" +#define __usdt_asm1(x) __usdt_str(x) "\n" +#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__) +#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__) +#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__) +#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__) +#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__) +#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__) +#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__) +#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__) +#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__) +#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__) +#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__) +#define __usdt_asm(...) 
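For the explicit-semaphore API, a hedged sketch of how one shared semaphore can gate a family of related USDTs; net_sema, myapp, the probe names, and lookup_flow_id() are made up for illustration:

	#include "usdt.h"

	/* one definition per executable/shared library; other .c files would use
	 * USDT_DECLARE_SEMA(net_sema) to reference the same semaphore
	 */
	USDT_DEFINE_SEMA(net_sema);

	/* hypothetical, potentially expensive lookup */
	static long lookup_flow_id(int proto)
	{
		return proto * 1000L;
	}

	void on_packet(int len, int proto)
	{
		long flow;

		/* a single check gates the extra work for the whole USDT family */
		if (!USDT_SEMA_IS_ACTIVE(net_sema))
			return;

		flow = lookup_flow_id(proto);
		USDT_WITH_EXPLICIT_SEMA(net_sema, myapp, pkt_len, flow, len);
		USDT_WITH_EXPLICIT_SEMA(net_sema, myapp, pkt_proto, flow, proto);
	}

Attaching a tracer to either myapp:pkt_len or myapp:pkt_proto activates the shared net_sema semaphore, so one USDT_SEMA_IS_ACTIVE() check covers both probes.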
__usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#ifdef __LP64__ +#define __usdt_asm_addr .8byte +#else +#define __usdt_asm_addr .4byte +#endif + +#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x) +#define __usdt_asm_strz(x) __usdt_asm_strz_(x) +#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x) +#define __usdt_asm_str(x) __usdt_asm_str_(x) + +/* "semaphoreless" USDT case */ +#ifndef __usdt_sema_none +#define __usdt_sema_none(sema) +#endif + +/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */ +#ifndef __usdt_sema_implicit +#define __usdt_sema_implicit(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm1(.ifndef sema) \ + __usdt_asm3( .pushsection .probes, "aw", "progbits") \ + __usdt_asm1( .weak sema) \ + __usdt_asm1( .hidden sema) \ + __usdt_asm1( .align 2) \ + __usdt_asm1(sema:) \ + __usdt_asm1( .zero 2) \ + __usdt_asm2( .type sema, @object) \ + __usdt_asm2( .size sema, 2) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + ); +#endif + +/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */ +#ifndef __usdt_sema_explicit +#define __usdt_sema_explicit(sema) \ + __asm__ __volatile__ ("" :: "m" (sema)); +#endif + +/* main USDT definition (nop and .note.stapsdt metadata) */ +#define __usdt_probe(group, name, sema_def, sema, ...) do { \ + sema_def(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm( 990: USDT_NOP) \ + __usdt_asm3( .pushsection .note.stapsdt, "", "note") \ + __usdt_asm1( .balign 4) \ + __usdt_asm3( .4byte 992f-991f,994f-993f,3) \ + __usdt_asm1(991: .asciz "stapsdt") \ + __usdt_asm1(992: .balign 4) \ + __usdt_asm1(993: __usdt_asm_addr 990b) \ + __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \ + __usdt_asm1( __usdt_asm_addr sema) \ + __usdt_asm_strz(group) \ + __usdt_asm_strz(name) \ + __usdt_asm_args(__VA_ARGS__) \ + __usdt_asm1( .ascii "\0") \ + __usdt_asm1(994: .balign 4) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.ifndef _.stapsdt.base) \ + __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\ + __usdt_asm1( .weak _.stapsdt.base) \ + __usdt_asm1( .hidden _.stapsdt.base) \ + __usdt_asm1(_.stapsdt.base:) \ + __usdt_asm1( .space 1) \ + __usdt_asm2( .size _.stapsdt.base, 1) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + :: __usdt_asm_ops(__VA_ARGS__) \ + ); \ +} while (0) + +/* + * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + * operand note format. + * + * The named register may be a longer or shorter (!) alias for the + * storage where the value in question is found. For example, on + * i386, 64-bit value may be put in register pairs, and a register + * name stored would identify just one of them. Previously, gcc was + * asked to emit the %w[id] (16-bit alias of some registers holding + * operands), even when a wider 32-bit value was used. + * + * Bottom line: the byte-width given before the @ sign governs. If + * there is a mismatch between that width and that of the named + * register, then a sys/sdt.h note consumer may need to employ + * architecture-specific heuristics to figure out where the compiler + * has actually put the complete value. 
+ */ +#if defined(__powerpc__) || defined(__powerpc64__) +#define __usdt_argref(id) %I[id]%[id] +#elif defined(__i386__) +#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +#define __usdt_argref(id) %[id] +#endif + +#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \ + __usdt_asm1(.ascii "@") \ + __usdt_asm_str(__usdt_argref(__usdt_aval##n)) + +#define __usdt_asm_args0 /* no arguments */ +#define __usdt_asm_args1 __usdt_asm_arg(1) +#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2) +#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3) +#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4) +#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5) +#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6) +#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7) +#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8) +#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9) +#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10) +#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11) +#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12) +#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__)) + +#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5) +#define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x)) + +/* + * We can't use __builtin_choose_expr() in C++, so fall back to table-based + * signedness determination for known types, utilizing templates magic. 
+ */ +#ifdef __cplusplus + +#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed) + +#include <cstddef> + +template<typename T> struct __usdt_t { static const bool is_signed = false; }; +template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {}; +template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {}; + +#define __usdt_def_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; } +#define __usdt_maybe_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; } + +__usdt_def_signed(signed char); +__usdt_def_signed(short); +__usdt_def_signed(int); +__usdt_def_signed(long); +__usdt_def_signed(long long); +__usdt_maybe_signed(char); +__usdt_maybe_signed(wchar_t); + +#else /* !__cplusplus */ + +#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4) +#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U)) +#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1) + +#endif /* __cplusplus */ + +#define __usdt_asm_op(n, x) \ + [__usdt_asz##n] "n" ((__usdt_is_signed(x) ? (int)-1 : 1) * (int)__usdt_arg_size(x)), \ + [__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x) + +#define __usdt_asm_ops0() [__usdt_dummy] "g" (0) +#define __usdt_asm_ops1(x) __usdt_asm_op(1, x) +#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x) +#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x) +#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x) +#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x) +#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x) +#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x) +#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x) +#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x) +#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x) +#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x) +#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x) +#define __usdt_asm_ops(...) 
__usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#endif /* __USDT_H */ diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 0e89fcff4d05..44c52f620fda 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -522,6 +522,18 @@ int proc_mount_contains(const char *option) return strstr(buf, option) != NULL; } +int cgroup_feature(const char *feature) +{ + char buf[PAGE_SIZE]; + ssize_t read; + + read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf)); + if (read < 0) + return read; + + return strstr(buf, feature) != NULL; +} + ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) { char path[PATH_MAX]; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index c69cab66254b..9dc90a1b386d 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -60,6 +60,7 @@ extern int cg_run_nowait(const char *cgroup, extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); int proc_mount_contains(const char *option); +int cgroup_feature(const char *feature); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); extern pid_t clone_into_cgroup(int cgroup_fd); diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 8730645d363a..dfb763819581 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -804,6 +804,662 @@ cleanup: return ret; } +/* + * Get the current frozen_usec for the cgroup. + */ +static long cg_check_freezetime(const char *cgroup) +{ + return cg_read_key_long(cgroup, "cgroup.stat.local", + "frozen_usec "); +} + +/* + * Test that the freeze time will behave as expected for an empty cgroup. + */ +static int test_cgfreezer_time_empty(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_empty"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create an empty cgroup and check that its freeze time + * is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 2) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 3) Unfreeze the cgroup. Check that the freeze time is + * larger than at 2). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Check the freeze time again to ensure that it has not + * changed. 
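The freeze-time checks in these tests all go through cg_check_freezetime(), i.e. the new "frozen_usec" key in cgroup.stat.local. Outside the selftest helpers, the same value could be read with a small sketch like the one below; the cgroup directory path and the cgroup v2 mount point are assumptions, only the file and key names come from the test above:

	#include <stdio.h>
	#include <string.h>

	/* return frozen time in usec for a cgroup v2 directory, or -1 on error */
	static long read_frozen_usec(const char *cg_dir)
	{
		char path[4096], key[64];
		long val;
		FILE *f;

		snprintf(path, sizeof(path), "%s/cgroup.stat.local", cg_dir);
		f = fopen(path, "r");
		if (!f)
			return -1;

		while (fscanf(f, "%63s %ld", key, &val) == 2) {
			if (!strcmp(key, "frozen_usec")) {
				fclose(f);
				return val;
			}
		}

		fclose(f);
		return -1;
	}

	/* e.g. read_frozen_usec("/sys/fs/cgroup/cg_time_test_empty") */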
+ */ + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * A simple test for cgroup freezer time accounting. This test follows + * the same flow as test_cgfreezer_time_empty, but with a single process + * in the cgroup. + */ +static int test_cgfreezer_time_simple(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_simple"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create a cgroup and check that its freeze time is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Populate the cgroup with one child and check that the + * freeze time is still 0. + */ + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr > prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 3) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 4) Unfreeze the cgroup. Check that the freeze time is + * larger than at 3). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Sleep for 1000 us. Check that the freeze time is the + * same as at 4). + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that freezer time accounting works as expected, even while we're + * populating a cgroup with processes. + */ +static int test_cgfreezer_time_populate(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + int i; + + cgroup = cg_name(root, "cg_time_test_populate"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 1) Populate the cgroup with 100 processes. Check that + * the freeze time is 0. + */ + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Wait for the group to become fully populated. Check + * that the freeze time is 0. + */ + if (cg_wait_for_proc_count(cgroup, 100)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 3) Freeze the cgroup and then populate it with 100 more + * processes. 
Check that the freeze time continues to grow. + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Wait for the group to become fully populated. Check + * that the freeze time is larger than at 3). + */ + if (cg_wait_for_proc_count(cgroup, 200)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Unfreeze the cgroup. Check that the freeze time is + * larger than at 4). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 6) Kill the processes. Check that the freeze time is the + * same as it was at 5). + */ + if (cg_killall(cgroup)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 7) Freeze and unfreeze the cgroup. Check that the freeze + * time is larger than it was at 6). + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that frozen time for a cgroup continues to work as expected, + * even as processes are migrated. Frozen cgroup A's freeze time should + * continue to increase and running cgroup B's should stay 0. + */ +static int test_cgfreezer_time_migrate(const char *root) +{ + long prev_A, curr_A, curr_B; + char *cgroup[2] = {0}; + int ret = KSFT_FAIL; + int pid; + + cgroup[0] = cg_name(root, "cg_time_test_migrate_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(root, "cg_time_test_migrate_B"); + if (!cgroup[1]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + pid = cg_run_nowait(cgroup[0], child_fn, NULL); + if (pid < 0) + goto cleanup; + + if (cg_wait_for_proc_count(cgroup[0], 1)) + goto cleanup; + + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A) { + debug("Expect time (%ld) to be 0\n", curr_A); + goto cleanup; + } + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + /* + * Freeze cgroup A. + */ + if (cg_freeze_wait(cgroup[0], true)) + goto cleanup; + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be > 0\n", curr_A); + goto cleanup; + } + + /* + * Migrate from A (frozen) to B (running). 
+ */ + if (cg_enter(cgroup[1], pid)) + goto cleanup; + + usleep(1000); + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr_A, prev_A); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup[0]) + cg_destroy(cgroup[0]); + free(cgroup[0]); + if (cgroup[1]) + cg_destroy(cgroup[1]); + free(cgroup[1]); + return ret; +} + +/* + * The test creates a cgroup and freezes it. Then it creates a child cgroup. + * After that it checks that the child cgroup has a non-zero freeze time + * that is less than the parent's. Next, it freezes the child, unfreezes + * the parent, and sleeps. Finally, it checks that the child's freeze + * time has grown larger than the parent's. + */ +static int test_cgfreezer_time_parent(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_parent_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_parent_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_freeze_wait(parent, true)) + goto cleanup; + + usleep(1000); + if (cg_create(child)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + /* + * Since the parent was frozen the entire time the child cgroup + * was being created, we expect the parent's freeze time to be + * larger than the child's. + * + * Ideally, we would be able to check both times simultaneously, + * but here we get the child's after we get the parent's. + */ + ptime = cg_check_freezetime(parent); + ctime = cg_check_freezetime(child); + if (ptime <= ctime) { + debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime); + goto cleanup; + } + + if (cg_freeze_nowait(child, true)) + goto cleanup; + + if (cg_freeze_wait(parent, false)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + usleep(100000); + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + + if (ctime <= ptime) { + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates a parent cgroup and a child cgroup. Then, it freezes + * the child and checks that the child's freeze time is greater than the + * parent's, which should be zero. 
+ */ +static int test_cgfreezer_time_child(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_child_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_child_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(child)) + goto cleanup; + + if (cg_freeze_wait(child, true)) + goto cleanup; + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + if (ptime != 0) { + debug("Expect ptime (%ld) to be 0\n", ptime); + goto cleanup; + } + + if (ctime <= ptime) { + debug("Expect ctime (%ld) <= ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates the following hierarchy: + * A + * | + * B + * | + * C + * + * Then it freezes the cgroups in the order C, B, A. + * Then it unfreezes the cgroups in the order A, B, C. + * Then it checks that C's freeze time is larger than B's and + * that B's is larger than A's. + */ +static int test_cgfreezer_time_nested(const char *root) +{ + char *cgroup[3] = {0}; + int ret = KSFT_FAIL; + long time[3] = {0}; + int i; + + cgroup[0] = cg_name(root, "cg_test_time_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(cgroup[0], "B"); + if (!cgroup[1]) + goto cleanup; + + cgroup[2] = cg_name(cgroup[1], "C"); + if (!cgroup[2]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + if (cg_create(cgroup[2])) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[0], true)) + goto cleanup; + + usleep(1000); + + if (cg_freeze_nowait(cgroup[0], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], false)) + goto cleanup; + + time[2] = cg_check_freezetime(cgroup[2]); + time[1] = cg_check_freezetime(cgroup[1]); + time[0] = cg_check_freezetime(cgroup[0]); + + if (time[2] <= time[1]) { + debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]); + goto cleanup; + } + + if (time[1] <= time[0]) { + debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + for (i = 2; i >= 0 && cgroup[i]; i--) { + cg_destroy(cgroup[i]); + free(cgroup[i]); + } + + return ret; +} + #define T(x) { x, #x } struct cgfreezer_test { int (*fn)(const char *root); @@ -819,6 +1475,13 @@ struct cgfreezer_test { T(test_cgfreezer_stopped), T(test_cgfreezer_ptraced), T(test_cgfreezer_vfork), + T(test_cgfreezer_time_empty), + T(test_cgfreezer_time_simple), + T(test_cgfreezer_time_populate), + T(test_cgfreezer_time_migrate), + T(test_cgfreezer_time_parent), + T(test_cgfreezer_time_child), + T(test_cgfreezer_time_nested), }; #undef T diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c index 9ecb83c6cc5c..d8a1d1cd5007 100644 --- a/tools/testing/selftests/cgroup/test_pids.c +++ b/tools/testing/selftests/cgroup/test_pids.c @@ -77,6 +77,9 @@ static int test_pids_events(const char *root) char *cg_parent = NULL, *cg_child = NULL; int pid; + if 
(cgroup_feature("pids_localevents") <= 0) + return KSFT_SKIP; + cg_parent = cg_name(root, "pids_parent"); cg_child = cg_name(cg_parent, "pids_child"); if (!cg_parent || !cg_child) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index 5a5a7a5f7e1d..a4ac80bb1003 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -446,9 +446,6 @@ TEST_F(coredump, socket_detect_userspace_client) if (info.coredump_mask & PIDFD_COREDUMPED) goto out; - if (read(fd_coredump, &c, 1) < 1) - goto out; - exit_code = EXIT_SUCCESS; out: if (fd_peer_pidfd >= 0) diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 5b230deb19e8..9a3499827d4b 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -4,6 +4,7 @@ TEST_GEN_FILES += access_memory access_memory_even TEST_FILES = _damon_sysfs.py +TEST_FILES += drgn_dump_damon_status.py # functionality tests TEST_PROGS += sysfs.sh diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 2b10854e4b1e..44b98f17f8ff 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -10,7 +10,8 @@ TEST_PROGS := \ mode-2-recovery-updelay.sh \ bond_options.sh \ bond-eth-type-change.sh \ - bond_macvlan_ipvlan.sh + bond_macvlan_ipvlan.sh \ + bond_passive_lacp.sh TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index 7bc148889ca7..187b478d0ddf 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -7,6 +7,8 @@ ALL_TESTS=" prio arp_validate num_grat_arp + fail_over_mac + vlan_over_bond " lib_dir=$(dirname "$0") @@ -352,8 +354,8 @@ garp_test() exp_num=$(echo "${param}" | cut -f6 -d ' ') active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") - slowwait_for_counter $((exp_num + 5)) $exp_num \ - tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" + slowwait_for_counter $((exp_num + 5)) $exp_num tc_rule_handle_stats_get \ + "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" &> /dev/null # check result real_num=$(tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}") @@ -376,6 +378,197 @@ num_grat_arp() done } +check_all_mac_same() +{ + RET=0 + # all slaves should have same mac address (with the first port's mac) + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + local eth1_mac=$(ip -n "$s_ns" -j link show eth1 | jq -r '.[]["address"]') + local eth2_mac=$(ip -n "$s_ns" -j link show eth2 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ] || [ "$eth0_mac" != "$bond_mac" ] || \ + [ "$eth1_mac" != "$bond_mac" ] || [ "$eth2_mac" != "$bond_mac" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_first() +{ + RET=0 + # bond mac address should be same with the first added slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_active() +{ + RET=0 + # bond mac address should be same with active slave + 
local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + local active_slave_mac=$(ip -n "$s_ns" -j link show "$active_slave" | jq -r '.[]["address"]') + if [ "$bond_mac" != "$active_slave_mac" ]; then + RET=1 + fi +} + +check_backup_slave_mac_not_change() +{ + RET=0 + # backup slave's mac address is not changed + if ip -n "$s_ns" -d -j link show type bond_slave | jq -e '.[] + | select(.linkinfo.info_slave_data.state=="BACKUP") + | select(.address != .linkinfo.info_slave_data.perm_hwaddr)' &> /dev/null; then + RET=1 + fi +} + +check_backup_slave_mac_inherit() +{ + local backup_mac + RET=0 + + # backup slaves should use mac[1] or mac[2] + local backup_macs=$(ip -n "$s_ns" -d -j link show type bond_slave | \ + jq -r '.[] | select(.linkinfo.info_slave_data.state=="BACKUP") | .address') + for backup_mac in $backup_macs; do + if [ "$backup_mac" != "${mac[1]}" ] && [ "$backup_mac" != "${mac[2]}" ]; then + RET=1 + fi + done +} + +check_first_slave_random_mac() +{ + RET=0 + # remove the first added slave and added it back + ip -n "$s_ns" link set eth0 nomaster + ip -n "$s_ns" link set eth0 master bond0 + + # the first slave should use random mac address + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" = "${mac[0]}" ] && RET=1 + log_test "bond fail_over_mac follow" "random first slave mac" + + # remove the first slave, the permanent MAC address should be restored back + ip -n "$s_ns" link set eth0 nomaster + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" != "${mac[0]}" ] && RET=1 +} + +do_active_backup_failover() +{ + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + ip -n ${s_ns} link set ${active_slave} down + slowwait 2 active_slave_changed $active_slave + ip -n ${s_ns} link set ${active_slave} up +} + +fail_over_mac() +{ + # Bring down the first interface on the switch to force the bond to + # select another active interface instead of the first one that joined. 
+ ip -n "$g_ns" link set s0 down + + # fail_over_mac none + bond_reset "mode active-backup miimon 100 fail_over_mac 0" + check_all_mac_same + log_test "fail_over_mac 0" "all slaves have same mac" + do_active_backup_failover + check_all_mac_same + log_test "fail_over_mac 0" "failover: all slaves have same mac" + + # fail_over_mac active + bond_reset "mode active-backup miimon 100 fail_over_mac 1" + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "backup slave mac is not changed" + do_active_backup_failover + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "failover: backup slave mac is not changed" + + # fail_over_mac follow + bond_reset "mode active-backup miimon 100 fail_over_mac 2" + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "backup slave mac inherit" + do_active_backup_failover + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "failover: bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "failover: backup slave mac inherit" + check_first_slave_random_mac + log_test "fail_over_mac 2" "first slave mac random" +} + +vlan_over_bond_arp() +{ + local mode="$1" + RET=0 + + bond_reset "mode $mode arp_interval 100 arp_ip_target 192.0.3.10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 101 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond arp" "$mode" +} + +vlan_over_bond_ns() +{ + local mode="$1" + RET=0 + + if skip_ns; then + log_test_skip "vlan_over_bond ns" "$mode" + return 0 + fi + + bond_reset "mode $mode arp_interval 100 ns_ip6_target 2001:db8::3:10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 102 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond ns" "$mode" +} + +vlan_over_bond() +{ + # add vlan 3 for client + ip -n "${c_ns}" link add eth0.3 link eth0 type vlan id 3 + ip -n "${c_ns}" link set eth0.3 up + ip -n "${c_ns}" addr add 192.0.3.10/24 dev eth0.3 + ip -n "${c_ns}" addr add 2001:db8::3:10/64 dev eth0.3 + + # Add tc rule to check the vlan pkts + tc -n "${c_ns}" qdisc add dev eth0.3 clsact + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol arp \ + handle 101 flower skip_hw arp_op request \ + arp_sip 192.0.3.1 arp_tip 192.0.3.10 action pass + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol ipv6 \ + handle 102 flower skip_hw ip_proto icmpv6 \ + type 135 src_ip 2001:db8::3:1 action pass + + vlan_over_bond_arp "active-backup" + vlan_over_bond_ns "active-backup" +} + trap cleanup EXIT setup_prepare diff --git 
a/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh new file mode 100755 index 000000000000..9c3b089813df --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test if a bond interface works with lacp_active=off. + +# shellcheck disable=SC2034 +REQUIRE_MZ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +# shellcheck disable=SC1091 +source "$lib_dir"/../../../net/forwarding/lib.sh + +# shellcheck disable=SC2317 +check_port_state() +{ + local netns=$1 + local port=$2 + local state=$3 + + ip -n "${netns}" -d -j link show "$port" | \ + jq -e ".[].linkinfo.info_slave_data.ad_actor_oper_port_state_str | index(\"${state}\") != null" > /dev/null +} + +check_pkt_count() +{ + RET=0 + local ns="$1" + local iface="$2" + + # wait 65s, one per 30s + slowwait_for_counter 65 2 tc_rule_handle_stats_get \ + "dev ${iface} egress" 101 ".packets" "-n ${ns}" &> /dev/null +} + +setup() { + setup_ns c_ns s_ns + + # shellcheck disable=SC2154 + ip -n "${c_ns}" link add eth0 type veth peer name eth0 netns "${s_ns}" + ip -n "${c_ns}" link add eth1 type veth peer name eth1 netns "${s_ns}" + + # Add tc filter to count the pkts + tc -n "${c_ns}" qdisc add dev eth0 clsact + tc -n "${c_ns}" filter add dev eth0 egress handle 101 protocol 0x8809 matchall action pass + tc -n "${s_ns}" qdisc add dev eth1 clsact + tc -n "${s_ns}" filter add dev eth1 egress handle 101 protocol 0x8809 matchall action pass + + ip -n "${s_ns}" link add bond0 type bond mode 802.3ad lacp_active on lacp_rate fast + ip -n "${s_ns}" link set eth0 master bond0 + ip -n "${s_ns}" link set eth1 master bond0 + + ip -n "${c_ns}" link add bond0 type bond mode 802.3ad lacp_active off lacp_rate fast + ip -n "${c_ns}" link set eth0 master bond0 + ip -n "${c_ns}" link set eth1 master bond0 + +} + +trap cleanup_all_ns EXIT +setup + +# The bond will send 2 lacpdu pkts during init time, let's wait at least 2s +# after interface up +ip -n "${c_ns}" link set bond0 up +sleep 2 + +# 1. The passive side shouldn't send LACPDU. +check_pkt_count "${c_ns}" "eth0" && RET=1 +log_test "802.3ad lacp_active off" "init port" + +ip -n "${s_ns}" link set bond0 up +# 2. The passive side should not have the 'active' flag. +RET=0 +slowwait 2 check_port_state "${c_ns}" "eth0" "active" && RET=1 +log_test "802.3ad lacp_active off" "port state active" + +# 3. The active side should have the 'active' flag. +RET=0 +slowwait 2 check_port_state "${s_ns}" "eth0" "active" || RET=1 +log_test "802.3ad lacp_active on" "port state active" + +# 4. Make sure the connection is not expired. +RET=0 +slowwait 5 check_port_state "${s_ns}" "eth0" "distributing" +slowwait 10 check_port_state "${s_ns}" "eth0" "expired" && RET=1 +log_test "bond 802.3ad lacp_active off" "port connection" + +# After testing, disconnect one port on each side to check the state. +ip -n "${s_ns}" link set eth0 nomaster +ip -n "${s_ns}" link set eth0 up +ip -n "${c_ns}" link set eth1 nomaster +ip -n "${c_ns}" link set eth1 up +# Due to Periodic Machine and Rx Machine state change, the bond will still +# send lacpdu pkts in a few seconds. sleep at lease 5s to make sure +# negotiation finished +sleep 5 + +# 5. The active side should keep sending LACPDU. +check_pkt_count "${s_ns}" "eth1" || RET=1 +log_test "bond 802.3ad lacp_active on" "port pkt after disconnect" + +# 6. The passive side shouldn't send LACPDU anymore. 
+check_pkt_count "${c_ns}" "eth0" && RET=1 +log_test "bond 802.3ad lacp_active off" "port pkt after disconnect" + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh index 195ef83cfbf1..167aa4a4a12a 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh @@ -39,6 +39,8 @@ g_ip4="192.0.2.254" s_ip6="2001:db8::1" c_ip6="2001:db8::10" g_ip6="2001:db8::254" +mac[0]="00:0a:0b:0c:0d:01" +mac[1]="00:0a:0b:0c:0d:02" gateway_create() { @@ -62,6 +64,7 @@ server_create() for i in $(seq 0 1); do ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh index 3a1333d9a85b..23a2932301cc 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh @@ -26,6 +26,7 @@ # +-------------------------------------+ source bond_topo_2d1c.sh +mac[2]="00:0a:0b:0c:0d:03" setup_prepare() { @@ -36,6 +37,7 @@ setup_prepare() # Add the extra device as we use 3 down links for bond0 local i=2 ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 ip -n ${s_ns} link set eth${i} master bond0 diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index dad4e5fda4db..832fa1caeb66 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -6,6 +6,8 @@ CONFIG_MACVLAN=y CONFIG_IPVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_CLS_MATCHALL=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y +CONFIG_VLAN_8021Q=m diff --git a/tools/testing/selftests/drivers/net/hw/csum.py b/tools/testing/selftests/drivers/net/hw/csum.py index cd23af875317..3e3a89a34afe 100755 --- a/tools/testing/selftests/drivers/net/hw/csum.py +++ b/tools/testing/selftests/drivers/net/hw/csum.py @@ -17,7 +17,7 @@ def test_receive(cfg, ipver="6", extra_args=None): ip_args = f"-{ipver} -S {cfg.remote_addr_v[ipver]} -D {cfg.addr_v[ipver]}" rx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -n 100 {ip_args} -r 1 -R {extra_args}" - tx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -n 100 {ip_args} -r 1 -T {extra_args}" + tx_cmd = f"{cfg.bin_remote} -i {cfg.remote_ifname} -n 100 {ip_args} -r 1 -T {extra_args}" with bkg(rx_cmd, exit_wait=True): wait_port_listen(34000, proto="udp") @@ -37,7 +37,7 @@ def test_transmit(cfg, ipver="6", extra_args=None): if extra_args != "-U -Z": extra_args += " -r 1" - rx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -L 1 -n 100 {ip_args} -R {extra_args}" + rx_cmd = f"{cfg.bin_remote} -i {cfg.remote_ifname} -L 1 -n 100 {ip_args} -R {extra_args}" tx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -L 1 -n 100 {ip_args} -T {extra_args}" with bkg(rx_cmd, host=cfg.remote, exit_wait=True): diff --git a/tools/testing/selftests/drivers/net/napi_threaded.py b/tools/testing/selftests/drivers/net/napi_threaded.py index b2698db39817..9699a100a87d 100755 --- a/tools/testing/selftests/drivers/net/napi_threaded.py +++ 
b/tools/testing/selftests/drivers/net/napi_threaded.py @@ -35,6 +35,8 @@ def _setup_deferred_cleanup(cfg) -> None: threaded = cmd(f"cat /sys/class/net/{cfg.ifname}/threaded").stdout defer(_set_threaded_state, cfg, threaded) + return combined + def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: """ @@ -49,7 +51,7 @@ def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: napi0_id = napis[0]['id'] napi1_id = napis[1]['id'] - _setup_deferred_cleanup(cfg) + qcnt = _setup_deferred_cleanup(cfg) # set threaded _set_threaded_state(cfg, 1) @@ -62,7 +64,7 @@ def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: nl.napi_set({'id': napi1_id, 'threaded': 'disabled'}) cmd(f"ethtool -L {cfg.ifname} combined 1") - cmd(f"ethtool -L {cfg.ifname} combined 2") + cmd(f"ethtool -L {cfg.ifname} combined {qcnt}") _assert_napi_threaded_enabled(nl, napi0_id) _assert_napi_threaded_disabled(nl, napi1_id) @@ -80,7 +82,7 @@ def change_num_queues(cfg, nl) -> None: napi0_id = napis[0]['id'] napi1_id = napis[1]['id'] - _setup_deferred_cleanup(cfg) + qcnt = _setup_deferred_cleanup(cfg) # set threaded _set_threaded_state(cfg, 1) @@ -90,7 +92,7 @@ def change_num_queues(cfg, nl) -> None: _assert_napi_threaded_enabled(nl, napi1_id) cmd(f"ethtool -L {cfg.ifname} combined 1") - cmd(f"ethtool -L {cfg.ifname} combined 2") + cmd(f"ethtool -L {cfg.ifname} combined {qcnt}") # check napi threaded is set for both napis _assert_napi_threaded_enabled(nl, napi0_id) diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index fcbdb1297e24..64ac0dfa46b7 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only dnotify_test devpts_pts +fclog file_stressor anon_inode_test kernfs_test diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile index 73d4650af1a5..85427d7f19b9 100644 --- a/tools/testing/selftests/filesystems/Makefile +++ b/tools/testing/selftests/filesystems/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test +TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog TEST_GEN_PROGS_EXTENDED := dnotify_test include ../lib.mk diff --git a/tools/testing/selftests/filesystems/fclog.c b/tools/testing/selftests/filesystems/fclog.c new file mode 100644 index 000000000000..912a8b755c3b --- /dev/null +++ b/tools/testing/selftests/filesystems/fclog.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai <cyphar@cyphar.com> + * Copyright (C) 2025 SUSE LLC. + */ + +#include <assert.h> +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mount.h> + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +FIXTURE(ns) +{ + int host_mntns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. 
*/ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); +} + +TEST_F(ns, fscontext_log_enodata) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + /* A brand new fscontext has no log entries. */ + char buf[128] = {}; + for (int i = 0; i < 16; i++) + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_errorfc) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + char buf[128] = {}; + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_errorfc_after_fsmount) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NOSUID); + ASSERT_SUCCESS(mfd); + ASSERT_SUCCESS(move_mount(mfd, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)); + + /* + * The fscontext log should still contain data even after + * FSCONFIG_CMD_CREATE and fsmount(). + */ + char buf[128] = {}; + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_emsgsize) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + char buf[128] = {}; + /* + * Attempting to read a message with too small a buffer should not + * result in the message getting consumed. + */ + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 0)); + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 1)); + for (int i = 0; i < 16; i++) + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 16)); + + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. 
*/ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c index 63ce708d93ed..e4b7c2b457ee 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu> #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include <fcntl.h> #include <sched.h> #include <stdio.h> @@ -10,20 +17,12 @@ #include <sys/mount.h> #include <unistd.h> #include <sys/syscall.h> +#include <sys/fanotify.h> #include "../../kselftest_harness.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include <sys/fanotify.h> - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_cmds[] = { diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c index 090a5ca65004..9f57ca46e3af 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu> #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include <fcntl.h> #include <sched.h> #include <stdio.h> @@ -10,21 +17,12 @@ #include <sys/mount.h> #include <unistd.h> #include <sys/syscall.h> +#include <sys/fanotify.h> #include "../../kselftest_harness.h" -#include "../../pidfd/pidfd.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include <sys/fanotify.h> - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_types[] = { diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 651fc9f13c08..45c14323a618 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -113,7 +113,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata, * necessarily mean a test failure, just that the limit has to be made * bigger. 
*/ - ASSERT_GT(400, nth_state->iteration); + ASSERT_GT(1000, nth_state->iteration); if (nth_state->iteration != 0) { ssize_t res; ssize_t res2; diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2925e47db995..8516e8434bc4 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -751,7 +751,7 @@ for (; _metadata->trigger; _metadata->trigger = \ __bail(_assert, _metadata)) -#define is_signed_type(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1)) +#define is_signed_var(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1)) #define __EXPECT(_expected, _expected_str, _seen, _seen_str, _t, _assert) do { \ /* Avoid multiple evaluation of the cases */ \ @@ -759,7 +759,7 @@ __typeof__(_seen) __seen = (_seen); \ if (!(__exp _t __seen)) { \ /* Report with actual signedness to avoid weird output. */ \ - switch (is_signed_type(__exp) * 2 + is_signed_type(__seen)) { \ + switch (is_signed_var(__exp) * 2 + is_signed_var(__seen)) { \ case 0: { \ uintmax_t __exp_print = (uintmax_t)__exp; \ uintmax_t __seen_print = (uintmax_t)__seen; \ diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index f6fe7a07a0a2..41b40c676d7f 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -169,6 +169,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_irq TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3 +TEST_GEN_PROGS_arm64 += arm64/kvm-uuid TEST_GEN_PROGS_arm64 += access_tracking_perf_test TEST_GEN_PROGS_arm64 += arch_timer TEST_GEN_PROGS_arm64 += coalesced_io_test diff --git a/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c b/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c index cef8f7323ceb..713005b6f508 100644 --- a/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c @@ -146,7 +146,7 @@ static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); + el0 = FIELD_GET(ID_AA64PFR0_EL1_EL0, val); return el0 == ID_AA64PFR0_EL1_EL0_IMP; } diff --git a/tools/testing/selftests/kvm/arm64/debug-exceptions.c b/tools/testing/selftests/kvm/arm64/debug-exceptions.c index e34963956fbc..1d431de8729c 100644 --- a/tools/testing/selftests/kvm/arm64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/arm64/debug-exceptions.c @@ -116,12 +116,12 @@ static void reset_debug_state(void) /* Reset all bcr/bvr/wcr/wvr registers */ dfr0 = read_sysreg(id_aa64dfr0_el1); - brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), dfr0); + brps = FIELD_GET(ID_AA64DFR0_EL1_BRPs, dfr0); for (i = 0; i <= brps; i++) { write_dbgbcr(i, 0); write_dbgbvr(i, 0); } - wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), dfr0); + wrps = FIELD_GET(ID_AA64DFR0_EL1_WRPs, dfr0); for (i = 0; i <= wrps; i++) { write_dbgwcr(i, 0); write_dbgwvr(i, 0); @@ -418,7 +418,7 @@ static void guest_code_ss(int test_cnt) static int debug_version(uint64_t id_aa64dfr0) { - return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0); + return FIELD_GET(ID_AA64DFR0_EL1_DebugVer, id_aa64dfr0); } static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) @@ -539,14 +539,14 @@ void test_guest_debug_exceptions_all(uint64_t aa64dfr0) int b, w, c; /* Number of breakpoints */ - 
brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), aa64dfr0) + 1; + brp_num = FIELD_GET(ID_AA64DFR0_EL1_BRPs, aa64dfr0) + 1; __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required"); /* Number of watchpoints */ - wrp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), aa64dfr0) + 1; + wrp_num = FIELD_GET(ID_AA64DFR0_EL1_WRPs, aa64dfr0) + 1; /* Number of context aware breakpoints */ - ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_CTX_CMPs), aa64dfr0) + 1; + ctx_brp_num = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs, aa64dfr0) + 1; pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__, brp_num, wrp_num, ctx_brp_num); diff --git a/tools/testing/selftests/kvm/arm64/kvm-uuid.c b/tools/testing/selftests/kvm/arm64/kvm-uuid.c new file mode 100644 index 000000000000..af9581b860f1 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/kvm-uuid.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Check that nobody has tampered with KVM's UID + +#include <errno.h> +#include <linux/arm-smccc.h> +#include <asm/kvm.h> +#include <kvm_util.h> + +#include "processor.h" + +/* + * Do NOT redefine these constants, or try to replace them with some + * "common" version. They are hardcoded here to detect any potential + * breakage happening in the rest of the kernel. + * + * KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 + */ +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU + +static void guest_code(void) +{ + struct arm_smccc_res res = {}; + + smccc_hvc(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, 0, 0, 0, 0, 0, 0, 0, &res); + + __GUEST_ASSERT(res.a0 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 && + res.a1 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 && + res.a2 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 && + res.a3 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3, + "Unexpected KVM-specific UID %lx %lx %lx %lx\n", res.a0, res.a1, res.a2, res.a3); + GUEST_DONE(); +} + +int main (int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + bool guest_done = false; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + while (!guest_done) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + break; + case UCALL_DONE: + guest_done = true; + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + default: + TEST_FAIL("Unexpected guest exit"); + } + } + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c index ebd70430c89d..f222538e6084 100644 --- a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c +++ b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c @@ -54,7 +54,7 @@ static void guest_code(void) * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having * hidden the feature at runtime without any other userspace action. 
*/ - __GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), + __GUEST_ASSERT(FIELD_GET(ID_AA64PFR0_EL1_GIC, read_sysreg(id_aa64pfr0_el1)) == 0, "GICv3 wrongly advertised"); @@ -165,7 +165,7 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&vcpu, NULL); pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - __TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0), + __TEST_REQUIRE(FIELD_GET(ID_AA64PFR0_EL1_GIC, pfr0), "GICv3 not supported."); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/arm64/page_fault_test.c b/tools/testing/selftests/kvm/arm64/page_fault_test.c index dc6559dad9d8..4ccbd389d133 100644 --- a/tools/testing/selftests/kvm/arm64/page_fault_test.c +++ b/tools/testing/selftests/kvm/arm64/page_fault_test.c @@ -95,14 +95,14 @@ static bool guest_check_lse(void) uint64_t isar0 = read_sysreg(id_aa64isar0_el1); uint64_t atomic; - atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC), isar0); + atomic = FIELD_GET(ID_AA64ISAR0_EL1_ATOMIC, isar0); return atomic >= 2; } static bool guest_check_dc_zva(void) { uint64_t dczid = read_sysreg(dczid_el0); - uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_EL0_DZP), dczid); + uint64_t dzp = FIELD_GET(DCZID_EL0_DZP, dczid); return dzp == 0; } @@ -195,7 +195,7 @@ static bool guest_set_ha(void) uint64_t hadbs, tcr; /* Skip if HA is not supported. */ - hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS), mmfr1); + hadbs = FIELD_GET(ID_AA64MMFR1_EL1_HAFDBS, mmfr1); if (hadbs == 0) return false; diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index d3bf9204409c..189321e96925 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -243,6 +243,7 @@ static void guest_code(void) GUEST_REG_SYNC(SYS_ID_AA64MMFR0_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1); GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); + GUEST_REG_SYNC(SYS_ID_AA64MMFR3_EL1); GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); GUEST_REG_SYNC(SYS_CTR_EL0); GUEST_REG_SYNC(SYS_MIDR_EL1); @@ -594,8 +595,8 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu) */ val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1)); - mte = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), val); - mte_frac = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac), val); + mte = FIELD_GET(ID_AA64PFR1_EL1_MTE, val); + mte_frac = FIELD_GET(ID_AA64PFR1_EL1_MTE_frac, val); if (mte != ID_AA64PFR1_EL1_MTE_MTE2 || mte_frac != ID_AA64PFR1_EL1_MTE_frac_NI) { ksft_test_result_skip("MTE_ASYNC or MTE_ASYMM are supported, nothing to test\n"); @@ -612,7 +613,7 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu) } val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1)); - mte_frac = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac), val); + mte_frac = FIELD_GET(ID_AA64PFR1_EL1_MTE_frac, val); if (mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI) ksft_test_result_pass("ID_AA64PFR1_EL1.MTE_frac=0 accepted and still 0xF\n"); else @@ -774,7 +775,7 @@ int main(void) /* Check for AARCH64 only system */ val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); - el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); + el0 = FIELD_GET(ID_AA64PFR0_EL1_EL0, val); aarch64_only = (el0 == ID_AA64PFR0_EL1_EL0_IMP); ksft_print_header(); diff --git a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c index f16b3b27e32e..a0c4ab839155 100644 --- 
a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c @@ -441,7 +441,7 @@ static void create_vpmu_vm(void *guest_code) /* Make sure that PMUv3 support is indicated in the ID register */ dfr0 = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1)); - pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), dfr0); + pmuver = FIELD_GET(ID_AA64DFR0_EL1_PMUVer, dfr0); TEST_ASSERT(pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP, "Unexpected PMUVER (0x%x) on the vCPU with PMUv3", pmuver); diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 9d69904cb608..eb115123d741 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -573,15 +573,15 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, err = ioctl(vcpu_fd, KVM_GET_ONE_REG, ®); TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val); + gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN4, val); *ipa4k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN4_NI, ID_AA64MMFR0_EL1_TGRAN4_52_BIT); - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val); + gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN64, val); *ipa64k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN64_NI, ID_AA64MMFR0_EL1_TGRAN64_IMP); - gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val); + gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN16, val); *ipa16k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN16_NI, ID_AA64MMFR0_EL1_TGRAN16_52_BIT); diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index d30625c18259..c744c603d688 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -1554,8 +1554,8 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc) } /* Read from the page to populate the shared zeropage. 
*/ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index b0d42eb04e3a..8dd81c0a4a5a 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -145,7 +145,7 @@ static bool try_access_buf(char *ptr, bool write) if (write) *ptr = 'x'; else - FORCE_READ(ptr); + FORCE_READ(*ptr); } signal_jump_set = false; diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 1afe14b9dc0c..c5940c0595be 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -50,8 +50,10 @@ void read_fault_pages(void *addr, unsigned long nr_pages) unsigned long i; for (i = 0; i < nr_pages; i++) { + unsigned long *addr2 = + ((unsigned long *)(addr + (i * huge_page_size))); /* Prevent the compiler from optimizing out the entire loop: */ - FORCE_READ(((unsigned long *)(addr + (i * huge_page_size)))); + FORCE_READ(*addr2); } } diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index c5a73617796a..ea945eebec2f 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -110,7 +110,7 @@ void *access_mem(void *ptr) * the memory access actually happens and prevents the compiler * from optimizing away this entire loop. */ - FORCE_READ((uint64_t *)ptr); + FORCE_READ(*(uint64_t *)ptr); } return NULL; diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index fccf9e797a0c..5bd52a951cbd 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -5,10 +5,14 @@ #define _GNU_SOURCE #include <errno.h> +#include <fcntl.h> +#include <linux/userfaultfd.h> #include <stdlib.h> #include <stdio.h> #include <string.h> +#include <sys/ioctl.h> #include <sys/mman.h> +#include <syscall.h> #include <time.h> #include <stdbool.h> @@ -168,6 +172,7 @@ static bool is_range_mapped(FILE *maps_fp, unsigned long start, if (first_val <= start && second_val >= end) { success = true; + fflush(maps_fp); break; } } @@ -175,6 +180,15 @@ static bool is_range_mapped(FILE *maps_fp, unsigned long start, return success; } +/* Check if [ptr, ptr + size) mapped in /proc/self/maps. */ +static bool is_ptr_mapped(FILE *maps_fp, void *ptr, unsigned long size) +{ + unsigned long start = (unsigned long)ptr; + unsigned long end = start + size; + + return is_range_mapped(maps_fp, start, end); +} + /* * Returns the start address of the mapping on success, else returns * NULL on failure. @@ -733,6 +747,249 @@ out: dont_unmap ? 
" [dontunnmap]" : ""); } +#ifdef __NR_userfaultfd +static void mremap_move_multi_invalid_vmas(FILE *maps_fp, + unsigned long page_size) +{ + char *test_name = "mremap move multiple invalid vmas"; + const size_t size = 10 * page_size; + bool success = true; + char *ptr, *tgt_ptr; + int uffd, err, i; + void *res; + struct uffdio_api api = { + .api = UFFD_API, + .features = UFFD_EVENT_PAGEFAULT, + }; + + uffd = syscall(__NR_userfaultfd, O_NONBLOCK); + if (uffd == -1) { + err = errno; + perror("userfaultfd"); + if (err == EPERM) { + ksft_test_result_skip("%s - missing uffd", test_name); + return; + } + success = false; + goto out; + } + if (ioctl(uffd, UFFDIO_API, &api)) { + perror("ioctl UFFDIO_API"); + success = false; + goto out_close_uffd; + } + + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + success = false; + goto out_close_uffd; + } + + tgt_ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (tgt_ptr == MAP_FAILED) { + perror("mmap"); + success = false; + goto out_close_uffd; + } + if (munmap(tgt_ptr, size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + + /* + * Unmap so we end up with: + * + * 0 2 4 6 8 10 offset in buffer + * |*| |*| |*| |*| |*| + * |*| |*| |*| |*| |*| + * + * Additionally, register each with UFFD. + */ + for (i = 0; i < 10; i += 2) { + void *unmap_ptr = &ptr[(i + 1) * page_size]; + unsigned long start = (unsigned long)&ptr[i * page_size]; + struct uffdio_register reg = { + .range = { + .start = start, + .len = page_size, + }, + .mode = UFFDIO_REGISTER_MODE_MISSING, + }; + + if (ioctl(uffd, UFFDIO_REGISTER, ®) == -1) { + perror("ioctl UFFDIO_REGISTER"); + success = false; + goto out_unmap; + } + if (munmap(unmap_ptr, page_size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + } + + /* + * Now try to move the entire range which is invalid for multi VMA move. + * + * This will fail, and no VMA should be moved, as we check this ahead of + * time. + */ + res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr); + err = errno; + if (res != MAP_FAILED) { + fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n"); + success = false; + goto out_unmap; + } + if (err != EFAULT) { + errno = err; + perror("mrmeap() unexpected error"); + success = false; + goto out_unmap; + } + if (is_ptr_mapped(maps_fp, tgt_ptr, page_size)) { + fprintf(stderr, + "Invalid uffd-armed VMA at start of multi range moved\n"); + success = false; + goto out_unmap; + } + + /* + * Now try to move a single VMA, this should succeed as not multi VMA + * move. + */ + res = mremap(ptr, page_size, page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr); + if (res == MAP_FAILED) { + perror("mremap single invalid-multi VMA"); + success = false; + goto out_unmap; + } + + /* + * Unmap the VMA, and remap a non-uffd registered (therefore, multi VMA + * move valid) VMA at the start of ptr range. + */ + if (munmap(tgt_ptr, page_size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + res = mmap(ptr, page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + if (res == MAP_FAILED) { + perror("mmap"); + success = false; + goto out_unmap; + } + + /* + * Now try to move the entire range, we should succeed in moving the + * first VMA, but no others, and report a failure. 
+ */ + res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr); + err = errno; + if (res != MAP_FAILED) { + fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n"); + success = false; + goto out_unmap; + } + if (err != EFAULT) { + errno = err; + perror("mrmeap() unexpected error"); + success = false; + goto out_unmap; + } + if (!is_ptr_mapped(maps_fp, tgt_ptr, page_size)) { + fprintf(stderr, "Valid VMA not moved\n"); + success = false; + goto out_unmap; + } + + /* + * Unmap the VMA, and map valid VMA at start of ptr range, and replace + * all existing multi-move invalid VMAs, except the last, with valid + * multi-move VMAs. + */ + if (munmap(tgt_ptr, page_size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + if (munmap(ptr, size - 2 * page_size)) { + perror("munmap"); + success = false; + goto out_unmap; + } + for (i = 0; i < 8; i += 2) { + res = mmap(&ptr[i * page_size], page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + if (res == MAP_FAILED) { + perror("mmap"); + success = false; + goto out_unmap; + } + } + + /* + * Now try to move the entire range, we should succeed in moving all but + * the last VMA, and report a failure. + */ + res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr); + err = errno; + if (res != MAP_FAILED) { + fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n"); + success = false; + goto out_unmap; + } + if (err != EFAULT) { + errno = err; + perror("mrmeap() unexpected error"); + success = false; + goto out_unmap; + } + + for (i = 0; i < 10; i += 2) { + bool is_mapped = is_ptr_mapped(maps_fp, + &tgt_ptr[i * page_size], page_size); + + if (i < 8 && !is_mapped) { + fprintf(stderr, "Valid VMA not moved at %d\n", i); + success = false; + goto out_unmap; + } else if (i == 8 && is_mapped) { + fprintf(stderr, "Invalid VMA moved at %d\n", i); + success = false; + goto out_unmap; + } + } + +out_unmap: + if (munmap(tgt_ptr, size)) + perror("munmap tgt"); + if (munmap(ptr, size)) + perror("munmap src"); +out_close_uffd: + close(uffd); +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} +#else +static void mremap_move_multi_invalid_vmas(FILE *maps_fp, unsigned long page_size) +{ + char *test_name = "mremap move multiple invalid vmas"; + + ksft_test_result_skip("%s - missing uffd", test_name); +} +#endif /* __NR_userfaultfd */ + /* Returns the time taken for the remap on success else returns -1. 
*/ static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) @@ -1074,7 +1331,7 @@ int main(int argc, char **argv) char *rand_addr; size_t rand_size; int num_expand_tests = 2; - int num_misc_tests = 8; + int num_misc_tests = 9; struct test test_cases[MAX_TEST] = {}; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -1197,8 +1454,6 @@ int main(int argc, char **argv) mremap_expand_merge(maps_fp, page_size); mremap_expand_merge_offset(maps_fp, page_size); - fclose(maps_fp); - mremap_move_within_range(pattern_seed, rand_addr); mremap_move_1mb_from_start(pattern_seed, rand_addr); mremap_shrink_multiple_vmas(page_size, /* inplace= */true); @@ -1207,6 +1462,9 @@ int main(int argc, char **argv) mremap_move_multiple_vmas(pattern_seed, page_size, /* dontunmap= */ true); mremap_move_multiple_vmas_split(pattern_seed, page_size, /* dontunmap= */ false); mremap_move_multiple_vmas_split(pattern_seed, page_size, /* dontunmap= */ true); + mremap_move_multi_invalid_vmas(maps_fp, page_size); + + fclose(maps_fp); if (run_perf_tests) { ksft_print_msg("\n%s\n", diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 0d4209eef0c3..e6face7c0166 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1525,7 +1525,7 @@ void zeropfn_tests(void) ret = madvise(mem, hpage_size, MADV_HUGEPAGE); if (!ret) { - FORCE_READ(mem); + FORCE_READ(*mem); ret = pagemap_ioctl(mem, hpage_size, &vec, 1, 0, 0, PAGE_IS_PFNZERO, 0, 0, PAGE_IS_PFNZERO); diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 05de1fc0005b..44a3f8a58806 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -439,8 +439,11 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, } madvise(*addr, fd_size, MADV_HUGEPAGE); - for (size_t i = 0; i < fd_size; i++) - FORCE_READ((*addr + i)); + for (size_t i = 0; i < fd_size; i++) { + char *addr2 = *addr + i; + + FORCE_READ(*addr2); + } if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index c20298ae98ea..b55d1809debc 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -23,7 +23,7 @@ * anything with it in order to trigger a read page fault. We therefore must use * volatile to stop the compiler from optimising this away. 
*/ -#define FORCE_READ(x) (*(volatile typeof(x) *)x) +#define FORCE_READ(x) (*(const volatile typeof(x) *)&(x)) extern unsigned int __page_size; extern unsigned int __page_shift; diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index b1e4618399be..a688871a98eb 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -107,6 +107,26 @@ #endif #endif +#ifndef __NR_open_tree_attr + #if defined __alpha__ + #define __NR_open_tree_attr 577 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_open_tree_attr (467 + 4000) + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_open_tree_attr (467 + 6000) + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_open_tree_attr (467 + 5000) + #endif + #elif defined __ia64__ + #define __NR_open_tree_attr (467 + 1024) + #else + #define __NR_open_tree_attr 467 + #endif +#endif + #ifndef MOUNT_ATTR_IDMAP #define MOUNT_ATTR_IDMAP 0x00100000 #endif @@ -121,6 +141,12 @@ static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flag return syscall(__NR_mount_setattr, dfd, path, flags, attr, size); } +static inline int sys_open_tree_attr(int dfd, const char *path, unsigned int flags, + struct mount_attr *attr, size_t size) +{ + return syscall(__NR_open_tree_attr, dfd, path, flags, attr, size); +} + static ssize_t write_nointr(int fd, const void *buf, size_t count) { ssize_t ret; @@ -1222,6 +1248,12 @@ TEST_F(mount_setattr_idmapped, attached_mount_inside_current_mount_namespace) attr.userns_fd = get_userns_fd(0, 10000, 10000); ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + ASSERT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } @@ -1255,6 +1287,12 @@ TEST_F(mount_setattr_idmapped, attached_mount_outside_current_mount_namespace) ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + ASSERT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } @@ -1321,6 +1359,19 @@ TEST_F(mount_setattr_idmapped, detached_mount_outside_current_mount_namespace) ASSERT_EQ(close(open_tree_fd), 0); } +static bool expected_uid_gid(int dfd, const char *path, int flags, + uid_t expected_uid, gid_t expected_gid) +{ + int ret; + struct stat st; + + ret = fstatat(dfd, path, &st, flags); + if (ret < 0) + return false; + + return st.st_uid == expected_uid && st.st_gid == expected_gid; +} + /** * Validate that currently changing the idmapping of an idmapped mount fails. 
*/ @@ -1331,6 +1382,8 @@ TEST_F(mount_setattr_idmapped, change_idmapping) .attr_set = MOUNT_ATTR_IDMAP, }; + ASSERT_TRUE(expected_uid_gid(-EBADF, "/mnt/D", 0, 0, 0)); + if (!mount_setattr_supported()) SKIP(return, "mount_setattr syscall not supported"); @@ -1348,27 +1401,25 @@ TEST_F(mount_setattr_idmapped, change_idmapping) AT_EMPTY_PATH, &attr, sizeof(attr)), 0); ASSERT_EQ(close(attr.userns_fd), 0); + EXPECT_FALSE(expected_uid_gid(open_tree_fd, ".", 0, 0, 0)); + EXPECT_TRUE(expected_uid_gid(open_tree_fd, ".", 0, 10000, 10000)); + /* Change idmapping on a detached mount that is already idmapped. */ attr.userns_fd = get_userns_fd(0, 20000, 10000); ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + EXPECT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + EXPECT_FALSE(expected_uid_gid(open_tree_fd, ".", 0, 20000, 20000)); + EXPECT_TRUE(expected_uid_gid(open_tree_fd, ".", 0, 10000, 10000)); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } -static bool expected_uid_gid(int dfd, const char *path, int flags, - uid_t expected_uid, gid_t expected_gid) -{ - int ret; - struct stat st; - - ret = fstatat(dfd, path, &st, flags); - if (ret < 0) - return false; - - return st.st_uid == expected_uid && st.st_gid == expected_gid; -} - TEST_F(mount_setattr_idmapped, idmap_mount_tree_invalid) { int open_tree_fd = -EBADF; diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore new file mode 100644 index 000000000000..ccfb40837a73 --- /dev/null +++ b/tools/testing/selftests/namespaces/.gitignore @@ -0,0 +1,3 @@ +nsid_test +file_handle_test +init_ino_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile new file mode 100644 index 000000000000..5fe4b3dc07d3 --- /dev/null +++ b/tools/testing/selftests/namespaces/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) + +TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test + +include ../lib.mk + diff --git a/tools/testing/selftests/namespaces/config b/tools/testing/selftests/namespaces/config new file mode 100644 index 000000000000..d09836260262 --- /dev/null +++ b/tools/testing/selftests/namespaces/config @@ -0,0 +1,7 @@ +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CGROUPS=y diff --git a/tools/testing/selftests/namespaces/file_handle_test.c b/tools/testing/selftests/namespaces/file_handle_test.c new file mode 100644 index 000000000000..f1bc5773f552 --- /dev/null +++ b/tools/testing/selftests/namespaces/file_handle_test.c @@ -0,0 +1,1429 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <grp.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/unistd.h> +#include "../kselftest_harness.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +TEST(nsfs_net_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to 
unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT as unprivileged user */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + if (fd < 0 && errno == EPERM) { + SKIP(free(handle); close(ns_fd); + return, + "Permission denied for unprivileged user (expected)"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_uts_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open UTS namespace file descriptor */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_ipc_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open IPC namespace file descriptor */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* 
Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_pid_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open PID namespace file descriptor */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_mnt_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open mount namespace file descriptor */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = 
malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open user namespace file descriptor */ + ns_fd = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_cgroup_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open cgroup namespace file descriptor */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "cgroup namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_time_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open time namespace file descriptor */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "time namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); 
close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_net_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current network namespace */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create network namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's network namespace handle from new user+net namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new network namespace"); + } + + /* Should fail with permission denied since we're in a different user 
namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_uts_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current UTS namespace */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create UTS namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's UTS namespace handle from new user+uts namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new UTS namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_ipc_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = 
malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current IPC namespace */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create IPC namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's IPC namespace handle from new user+ipc namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new IPC namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_mnt_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current mount namespace */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = 
MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create mount namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's mount namespace handle from new user+mnt namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new mount namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_cgroup_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current cgroup namespace */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "cgroup namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); 
close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create cgroup namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's cgroup namespace handle from new user+cgroup namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new cgroup namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_pid_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current PID namespace */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop 
privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new PID namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create PID namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for PID namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in PID namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new PID namespace */ + /* Try to open parent's PID namespace handle from new user+pid namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new PID namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_time_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current time namespace */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "time namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + 
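	/*
	 * With AT_EMPTY_PATH and an empty name, name_to_handle_at() encodes
	 * a handle for the nsfs inode referenced by ns_fd itself, roughly
	 * (sketch, error handling omitted):
	 *
	 *	handle->handle_bytes = MAX_HANDLE_SZ;
	 *	name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
	 *	fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
	 *
	 * Kernels whose nsfs does not implement file handle encoding are
	 * expected to fail with EOPNOTSUPP, hence the skip above rather
	 * than a hard failure.
	 */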
} + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new time namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWTIME); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create time namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for time namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in time namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new time namespace */ + /* Try to open parent's time namespace handle from new user+time namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new time namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_open_flags) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 
0); + + /* Test invalid flags that should fail */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_WRONLY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDWR); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TRUNC); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECT); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TMPFILE); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECTORY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, ENOTDIR); + + close(ns_fd); + free(handle); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/init_ino_test.c b/tools/testing/selftests/namespaces/init_ino_test.c new file mode 100644 index 000000000000..5b6993c3740b --- /dev/null +++ b/tools/testing/selftests/namespaces/init_ino_test.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2025 Christian Brauner <brauner@kernel.org> + +#define _GNU_SOURCE +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <linux/nsfs.h> + +#include "../kselftest_harness.h" + +struct ns_info { + const char *name; + const char *proc_path; + unsigned int expected_ino; +}; + +static struct ns_info namespaces[] = { + { "ipc", "/proc/1/ns/ipc", IPC_NS_INIT_INO }, + { "uts", "/proc/1/ns/uts", UTS_NS_INIT_INO }, + { "user", "/proc/1/ns/user", USER_NS_INIT_INO }, + { "pid", "/proc/1/ns/pid", PID_NS_INIT_INO }, + { "cgroup", "/proc/1/ns/cgroup", CGROUP_NS_INIT_INO }, + { "time", "/proc/1/ns/time", TIME_NS_INIT_INO }, + { "net", "/proc/1/ns/net", NET_NS_INIT_INO }, + { "mnt", "/proc/1/ns/mnt", MNT_NS_INIT_INO }, +}; + +TEST(init_namespace_inodes) +{ + struct stat st; + + for (int i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) { + int ret = stat(namespaces[i].proc_path, &st); + + /* Some namespaces might not be available (e.g., time namespace on older kernels) */ + if (ret < 0) { + if (errno == ENOENT) { + ksft_test_result_skip("%s namespace not available\n", + namespaces[i].name); + continue; + } + ASSERT_GE(ret, 0) + TH_LOG("Failed to stat %s: %s", + namespaces[i].proc_path, strerror(errno)); + } + + ASSERT_EQ(st.st_ino, namespaces[i].expected_ino) + TH_LOG("Namespace %s has inode 0x%lx, expected 0x%x", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + + ksft_print_msg("Namespace %s: inode 0x%lx matches expected 0x%x\n", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c new file mode 100644 index 000000000000..e28accd74a57 --- /dev/null +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -0,0 +1,986 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <assert.h> +#include <fcntl.h> +#include <inttypes.h> +#include <libgen.h> +#include <limits.h> +#include <pthread.h> +#include <string.h> +#include <sys/mount.h> +#include <poll.h> +#include <sys/epoll.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <unistd.h> +#include <linux/fs.h> +#include <linux/limits.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" + +TEST(nsid_mntns_basic) +{ + __u64 mnt_ns_id = 0; + int fd_mntns; + int ret; + + /* Open 
the current mount namespace */ + fd_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_mntns, 0); + + /* Get the mount namespace ID */ + ret = ioctl(fd_mntns, NS_GET_MNTNS_ID, &mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(mnt_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 mnt_ns_id2 = 0; + ret = ioctl(fd_mntns, NS_GET_ID, &mnt_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mnt_ns_id, mnt_ns_id2); + + close(fd_mntns); +} + +TEST(nsid_mntns_separate) +{ + __u64 parent_mnt_ns_id = 0; + __u64 child_mnt_ns_id = 0; + int fd_parent_mntns, fd_child_mntns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's mount namespace ID */ + fd_parent_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_parent_mntns, 0); + ret = ioctl(fd_parent_mntns, NS_GET_ID, &parent_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_mnt_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_mntns); + SKIP(return, "No permission to create mount namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's mount namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid); + fd_child_mntns = open(path, O_RDONLY); + ASSERT_GE(fd_child_mntns, 0); + + /* Get child's mount namespace ID */ + ret = ioctl(fd_child_mntns, NS_GET_ID, &child_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_mnt_ns_id, 0); + + /* Parent and child should have different mount namespace IDs */ + ASSERT_NE(parent_mnt_ns_id, child_mnt_ns_id); + + close(fd_parent_mntns); + close(fd_child_mntns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_cgroupns_basic) +{ + __u64 cgroup_ns_id = 0; + int fd_cgroupns; + int ret; + + /* Open the current cgroup namespace */ + fd_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_cgroupns, 0); + + /* Get the cgroup namespace ID */ + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(cgroup_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 cgroup_ns_id2 = 0; + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(cgroup_ns_id, cgroup_ns_id2); + + close(fd_cgroupns); +} + +TEST(nsid_cgroupns_separate) +{ + __u64 parent_cgroup_ns_id = 0; + __u64 child_cgroup_ns_id = 0; + int fd_parent_cgroupns, fd_child_cgroupns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's cgroup namespace ID */ + fd_parent_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_parent_cgroupns, 0); + ret = ioctl(fd_parent_cgroupns, NS_GET_ID, &parent_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_cgroup_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child 
process */ + close(pipefd[0]); + + /* Create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_cgroupns); + SKIP(return, "No permission to create cgroup namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's cgroup namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", pid); + fd_child_cgroupns = open(path, O_RDONLY); + ASSERT_GE(fd_child_cgroupns, 0); + + /* Get child's cgroup namespace ID */ + ret = ioctl(fd_child_cgroupns, NS_GET_ID, &child_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_cgroup_ns_id, 0); + + /* Parent and child should have different cgroup namespace IDs */ + ASSERT_NE(parent_cgroup_ns_id, child_cgroup_ns_id); + + close(fd_parent_cgroupns); + close(fd_child_cgroupns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_ipcns_basic) +{ + __u64 ipc_ns_id = 0; + int fd_ipcns; + int ret; + + /* Open the current IPC namespace */ + fd_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_ipcns, 0); + + /* Get the IPC namespace ID */ + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(ipc_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 ipc_ns_id2 = 0; + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(ipc_ns_id, ipc_ns_id2); + + close(fd_ipcns); +} + +TEST(nsid_ipcns_separate) +{ + __u64 parent_ipc_ns_id = 0; + __u64 child_ipc_ns_id = 0; + int fd_parent_ipcns, fd_child_ipcns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's IPC namespace ID */ + fd_parent_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_parent_ipcns, 0); + ret = ioctl(fd_parent_ipcns, NS_GET_ID, &parent_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_ipc_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_ipcns); + SKIP(return, "No permission to create IPC namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's IPC namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/ipc", pid); + fd_child_ipcns = open(path, O_RDONLY); + ASSERT_GE(fd_child_ipcns, 0); + + /* Get child's IPC namespace ID */ + ret = ioctl(fd_child_ipcns, NS_GET_ID, &child_ipc_ns_id); + 
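	/*
	 * NS_GET_ID reports the namespace's 64-bit identifier; a minimal
	 * stand-alone query looks like this (sketch, assuming a kernel that
	 * exposes the ioctl):
	 *
	 *	__u64 id;
	 *	int fd = open("/proc/self/ns/ipc", O_RDONLY);
	 *
	 *	if (fd >= 0 && ioctl(fd, NS_GET_ID, &id) == 0)
	 *		printf("ipc ns id: %llu\n", (unsigned long long)id);
	 *
	 * The assertions that follow only rely on the ID being non-zero,
	 * stable across repeated queries, and different between distinct
	 * namespaces.
	 */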
ASSERT_EQ(ret, 0); + ASSERT_NE(child_ipc_ns_id, 0); + + /* Parent and child should have different IPC namespace IDs */ + ASSERT_NE(parent_ipc_ns_id, child_ipc_ns_id); + + close(fd_parent_ipcns); + close(fd_child_ipcns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_utsns_basic) +{ + __u64 uts_ns_id = 0; + int fd_utsns; + int ret; + + /* Open the current UTS namespace */ + fd_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_utsns, 0); + + /* Get the UTS namespace ID */ + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(uts_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 uts_ns_id2 = 0; + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(uts_ns_id, uts_ns_id2); + + close(fd_utsns); +} + +TEST(nsid_utsns_separate) +{ + __u64 parent_uts_ns_id = 0; + __u64 child_uts_ns_id = 0; + int fd_parent_utsns, fd_child_utsns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's UTS namespace ID */ + fd_parent_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_parent_utsns, 0); + ret = ioctl(fd_parent_utsns, NS_GET_ID, &parent_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_uts_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_utsns); + SKIP(return, "No permission to create UTS namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's UTS namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid); + fd_child_utsns = open(path, O_RDONLY); + ASSERT_GE(fd_child_utsns, 0); + + /* Get child's UTS namespace ID */ + ret = ioctl(fd_child_utsns, NS_GET_ID, &child_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_uts_ns_id, 0); + + /* Parent and child should have different UTS namespace IDs */ + ASSERT_NE(parent_uts_ns_id, child_uts_ns_id); + + close(fd_parent_utsns); + close(fd_child_utsns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_userns_basic) +{ + __u64 user_ns_id = 0; + int fd_userns; + int ret; + + /* Open the current user namespace */ + fd_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_userns, 0); + + /* Get the user namespace ID */ + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(user_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 user_ns_id2 = 0; + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(user_ns_id, user_ns_id2); + + close(fd_userns); +} + +TEST(nsid_userns_separate) +{ + __u64 parent_user_ns_id = 0; + __u64 child_user_ns_id = 0; + int fd_parent_userns, fd_child_userns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's user namespace ID */ + fd_parent_userns = open("/proc/self/ns/user", 
O_RDONLY); + ASSERT_GE(fd_parent_userns, 0); + ret = ioctl(fd_parent_userns, NS_GET_ID, &parent_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_user_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_userns); + SKIP(return, "No permission to create user namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's user namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); + fd_child_userns = open(path, O_RDONLY); + ASSERT_GE(fd_child_userns, 0); + + /* Get child's user namespace ID */ + ret = ioctl(fd_child_userns, NS_GET_ID, &child_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_user_ns_id, 0); + + /* Parent and child should have different user namespace IDs */ + ASSERT_NE(parent_user_ns_id, child_user_ns_id); + + close(fd_parent_userns); + close(fd_child_userns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_timens_basic) +{ + __u64 time_ns_id = 0; + int fd_timens; + int ret; + + /* Open the current time namespace */ + fd_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get the time namespace ID */ + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(time_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 time_ns_id2 = 0; + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(time_ns_id, time_ns_id2); + + close(fd_timens); +} + +TEST(nsid_timens_separate) +{ + __u64 parent_time_ns_id = 0; + __u64 child_time_ns_id = 0; + int fd_parent_timens, fd_child_timens; + int ret; + pid_t pid; + int pipefd[2]; + + /* Open the current time namespace */ + fd_parent_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_parent_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get parent's time namespace ID */ + ret = ioctl(fd_parent_timens, NS_GET_ID, &parent_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_time_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new time namespace */ + ret = unshare(CLONE_NEWTIME); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES || errno == EINVAL) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + 
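			/*
			 * unshare(CLONE_NEWTIME) does not move the caller into
			 * the new time namespace; only children forked
			 * afterwards start in it. The grandchild's PID is
			 * therefore forwarded over the pipe so the parent can
			 * open /proc/<pid>/ns/time of a task that actually
			 * lives in the new namespace.
			 */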
write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_timens); + close(pipefd[0]); + SKIP(return, "Cannot create time namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's time namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/time", grandchild_pid); + fd_child_timens = open(path, O_RDONLY); + ASSERT_GE(fd_child_timens, 0); + + /* Get child's time namespace ID */ + ret = ioctl(fd_child_timens, NS_GET_ID, &child_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_time_ns_id, 0); + + /* Parent and child should have different time namespace IDs */ + ASSERT_NE(parent_time_ns_id, child_time_ns_id); + + close(fd_parent_timens); + close(fd_child_timens); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_pidns_basic) +{ + __u64 pid_ns_id = 0; + int fd_pidns; + int ret; + + /* Open the current PID namespace */ + fd_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_pidns, 0); + + /* Get the PID namespace ID */ + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(pid_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 pid_ns_id2 = 0; + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(pid_ns_id, pid_ns_id2); + + close(fd_pidns); +} + +TEST(nsid_pidns_separate) +{ + __u64 parent_pid_ns_id = 0; + __u64 child_pid_ns_id = 0; + int fd_parent_pidns, fd_child_pidns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's PID namespace ID */ + fd_parent_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_parent_pidns, 0); + ret = ioctl(fd_parent_pidns, NS_GET_ID, &parent_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_pid_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_pidns); + close(pipefd[0]); + SKIP(return, "No permission to create PID namespace"); + } + + ASSERT_EQ(buf, 'Y'); 
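	/*
	 * As with unshare(CLONE_NEWTIME) above, unshare(CLONE_NEWPID) only
	 * affects children forked after the call, so the parent reads the
	 * grandchild's PID from the pipe and inspects that task's
	 * /proc/<pid>/ns/pid to obtain the new namespace's ID.
	 */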
+ + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's PID namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/pid", grandchild_pid); + fd_child_pidns = open(path, O_RDONLY); + ASSERT_GE(fd_child_pidns, 0); + + /* Get child's PID namespace ID */ + ret = ioctl(fd_child_pidns, NS_GET_ID, &child_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_pid_ns_id, 0); + + /* Parent and child should have different PID namespace IDs */ + ASSERT_NE(parent_pid_ns_id, child_pid_ns_id); + + close(fd_parent_pidns); + close(fd_child_pidns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_netns_basic) +{ + __u64 net_ns_id = 0; + __u64 netns_cookie = 0; + int fd_netns; + int sock; + socklen_t optlen; + int ret; + + /* Open the current network namespace */ + fd_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_netns, 0); + + /* Get the network namespace ID via ioctl */ + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(net_ns_id, 0); + + /* Create a socket to get the SO_NETNS_COOKIE */ + sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(sock, 0); + + /* Get the network namespace cookie via socket option */ + optlen = sizeof(netns_cookie); + ret = getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + ASSERT_EQ(optlen, sizeof(netns_cookie)); + + /* The namespace ID and cookie should be identical */ + ASSERT_EQ(net_ns_id, netns_cookie); + + /* Verify we can get the same ID again */ + __u64 net_ns_id2 = 0; + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(net_ns_id, net_ns_id2); + + close(sock); + close(fd_netns); +} + +TEST(nsid_netns_separate) +{ + __u64 parent_net_ns_id = 0; + __u64 parent_netns_cookie = 0; + __u64 child_net_ns_id = 0; + __u64 child_netns_cookie = 0; + int fd_parent_netns, fd_child_netns; + int parent_sock, child_sock; + socklen_t optlen; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's network namespace ID */ + fd_parent_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_parent_netns, 0); + ret = ioctl(fd_parent_netns, NS_GET_ID, &parent_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_net_ns_id, 0); + + /* Get parent's network namespace cookie */ + parent_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(parent_sock, 0); + optlen = sizeof(parent_netns_cookie); + ret = getsockopt(parent_sock, SOL_SOCKET, SO_NETNS_COOKIE, &parent_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify parent's ID and cookie match */ + ASSERT_EQ(parent_net_ns_id, parent_netns_cookie); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + 
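		/*
		 * The child exited right after signalling the skip, so the
		 * kill() here is effectively a no-op and waitpid() merely
		 * reaps it before the parent drops its descriptors and skips
		 * the test.
		 */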
close(fd_parent_netns); + close(parent_sock); + SKIP(return, "No permission to create network namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's network namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/net", pid); + fd_child_netns = open(path, O_RDONLY); + ASSERT_GE(fd_child_netns, 0); + + /* Get child's network namespace ID */ + ret = ioctl(fd_child_netns, NS_GET_ID, &child_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_net_ns_id, 0); + + /* Create socket in child's namespace to get cookie */ + ret = setns(fd_child_netns, CLONE_NEWNET); + if (ret == 0) { + child_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(child_sock, 0); + + optlen = sizeof(child_netns_cookie); + ret = getsockopt(child_sock, SOL_SOCKET, SO_NETNS_COOKIE, &child_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify child's ID and cookie match */ + ASSERT_EQ(child_net_ns_id, child_netns_cookie); + + close(child_sock); + + /* Return to parent namespace */ + setns(fd_parent_netns, CLONE_NEWNET); + } + + /* Parent and child should have different network namespace IDs */ + ASSERT_NE(parent_net_ns_id, child_net_ns_id); + if (child_netns_cookie != 0) { + ASSERT_NE(parent_netns_cookie, child_netns_cookie); + } + + close(fd_parent_netns); + close(fd_child_netns); + close(parent_sock); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b31a71f2b372..2b31d4a93ad7 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -99,6 +99,7 @@ TEST_GEN_PROGS += bind_wildcard TEST_GEN_PROGS += bind_timewait TEST_PROGS += test_vxlan_mdb.sh TEST_PROGS += test_bridge_neigh_suppress.sh +TEST_PROGS += test_vxlan_nh.sh TEST_PROGS += test_vxlan_nolocalbypass.sh TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += test_neigh.sh @@ -115,6 +116,7 @@ TEST_PROGS += skf_net_off.sh TEST_GEN_FILES += skf_net_off TEST_GEN_FILES += tfo TEST_PROGS += tfo_passive.sh +TEST_PROGS += broadcast_ether_dst.sh TEST_PROGS += broadcast_pmtu.sh TEST_PROGS += ipv6_force_forwarding.sh diff --git a/tools/testing/selftests/net/bind_bhash.c b/tools/testing/selftests/net/bind_bhash.c index 57ff67a3751e..da04b0b19b73 100644 --- a/tools/testing/selftests/net/bind_bhash.c +++ b/tools/testing/selftests/net/bind_bhash.c @@ -75,7 +75,7 @@ static void *setup(void *arg) int *array = (int *)arg; for (i = 0; i < MAX_CONNECTIONS; i++) { - sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr); + sock_fd = bind_socket(SO_REUSEPORT, setup_addr); if (sock_fd < 0) { ret = sock_fd; pthread_exit(&ret); @@ -103,7 +103,7 @@ int main(int argc, const char *argv[]) setup_addr = use_v6 ? 
setup_addr_v6 : setup_addr_v4; - listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr); + listener_fd = bind_socket(SO_REUSEPORT, setup_addr); if (listen(listener_fd, 100) < 0) { perror("listen failed"); return -1; diff --git a/tools/testing/selftests/net/broadcast_ether_dst.sh b/tools/testing/selftests/net/broadcast_ether_dst.sh new file mode 100755 index 000000000000..334a7eca8a80 --- /dev/null +++ b/tools/testing/selftests/net/broadcast_ether_dst.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Author: Brett A C Sheffield <bacs@librecast.net> +# Author: Oscar Maes <oscmaes92@gmail.com> +# +# Ensure destination ethernet field is correctly set for +# broadcast packets + +source lib.sh + +CLIENT_IP4="192.168.0.1" +GW_IP4="192.168.0.2" + +setup() { + setup_ns CLIENT_NS SERVER_NS + + ip -net "${SERVER_NS}" link add link1 type veth \ + peer name link0 netns "${CLIENT_NS}" + + ip -net "${CLIENT_NS}" link set link0 up + ip -net "${CLIENT_NS}" addr add "${CLIENT_IP4}"/24 dev link0 + + ip -net "${SERVER_NS}" link set link1 up + + ip -net "${CLIENT_NS}" route add default via "${GW_IP4}" + ip netns exec "${CLIENT_NS}" arp -s "${GW_IP4}" 00:11:22:33:44:55 +} + +cleanup() { + rm -f "${CAPFILE}" "${OUTPUT}" + ip -net "${SERVER_NS}" link del link1 + cleanup_ns "${CLIENT_NS}" "${SERVER_NS}" +} + +test_broadcast_ether_dst() { + local rc=0 + CAPFILE=$(mktemp -u cap.XXXXXXXXXX) + OUTPUT=$(mktemp -u out.XXXXXXXXXX) + + echo "Testing ethernet broadcast destination" + + # start tcpdump listening for icmp + # tcpdump will exit after receiving a single packet + # timeout will kill tcpdump if it is still running after 2s + timeout 2s ip netns exec "${CLIENT_NS}" \ + tcpdump -i link0 -c 1 -w "${CAPFILE}" icmp &> "${OUTPUT}" & + pid=$! + slowwait 1 grep -qs "listening" "${OUTPUT}" + + # send broadcast ping + ip netns exec "${CLIENT_NS}" \ + ping -W0.01 -c1 -b 255.255.255.255 &> /dev/null + + # wait for tcpdump for exit after receiving packet + wait "${pid}" + + # compare ethernet destination field to ff:ff:ff:ff:ff:ff + ether_dst=$(tcpdump -r "${CAPFILE}" -tnne 2>/dev/null | \ + awk '{sub(/,/,"",$3); print $3}') + if [[ "${ether_dst}" == "ff:ff:ff:ff:ff:ff" ]]; then + echo "[ OK ]" + rc="${ksft_pass}" + else + echo "[FAIL] expected dst ether addr to be ff:ff:ff:ff:ff:ff," \ + "got ${ether_dst}" + rc="${ksft_fail}" + fi + + return "${rc}" +} + +if [ ! -x "$(command -v tcpdump)" ]; then + echo "SKIP: Could not run test without tcpdump tool" + exit "${ksft_skip}" +fi + +trap cleanup EXIT + +setup +test_broadcast_ether_dst + +exit $? diff --git a/tools/testing/selftests/net/can/config b/tools/testing/selftests/net/can/config new file mode 100644 index 000000000000..188f79796670 --- /dev/null +++ b/tools/testing/selftests/net/can/config @@ -0,0 +1,3 @@ +CONFIG_CAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_VCAN=m diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b39f748c2572..2b0a90581e2f 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -467,8 +467,8 @@ ipv6_fdb_grp_fcnal() log_test $? 0 "Get Fdb nexthop group by id" # fdb nexthop group can only contain fdb nexthops - run_cmd "$IP nexthop add id 63 via 2001:db8:91::4" - run_cmd "$IP nexthop add id 64 via 2001:db8:91::5" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::5 dev veth1" run_cmd "$IP nexthop add id 103 group 63/64 fdb" log_test $? 
2 "Fdb Nexthop group with non-fdb nexthops" @@ -494,6 +494,26 @@ ipv6_fdb_grp_fcnal() run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb" log_test $? 2 "Fdb Nexthop with encap" + # Replace FDB nexthop to non-FDB and vice versa + run_cmd "$IP nexthop add id 70 via 2001:db8:91::2 fdb" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1" + log_test $? 0 "Replace FDB nexthop to non-FDB nexthop" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 fdb" + log_test $? 0 "Replace non-FDB nexthop to FDB nexthop" + + # Replace FDB nexthop address while in a group + run_cmd "$IP nexthop add id 71 group 70 fdb" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::3 fdb" + log_test $? 0 "Replace FDB nexthop address while in a group" + + # Cannot replace FDB nexthop to non-FDB and vice versa while in a group + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1" + log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group" + run_cmd "$IP nexthop add id 72 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 73 group 72" + run_cmd "$IP nexthop replace id 72 via 2001:db8:91::2 fdb" + log_test $? 2 "Replace non-FDB nexthop to FDB nexthop while in a group" + run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100" run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" log_test $? 0 "Fdb mac add with nexthop group" @@ -547,15 +567,15 @@ ipv4_fdb_grp_fcnal() log_test $? 0 "Get Fdb nexthop group by id" # fdb nexthop group can only contain fdb nexthops - run_cmd "$IP nexthop add id 14 via 172.16.1.2" - run_cmd "$IP nexthop add id 15 via 172.16.1.3" + run_cmd "$IP nexthop add id 14 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 15 via 172.16.1.3 dev veth1" run_cmd "$IP nexthop add id 103 group 14/15 fdb" log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" # Non fdb nexthop group can not contain fdb nexthops run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb" run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb" - run_cmd "$IP nexthop add id 104 group 14/15" + run_cmd "$IP nexthop add id 104 group 16/17" log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" # fdb nexthop cannot have blackhole @@ -574,6 +594,26 @@ ipv4_fdb_grp_fcnal() run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb" log_test $? 2 "Fdb Nexthop with encap" + # Replace FDB nexthop to non-FDB and vice versa + run_cmd "$IP nexthop add id 18 via 172.16.1.2 fdb" + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1" + log_test $? 0 "Replace FDB nexthop to non-FDB nexthop" + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 fdb" + log_test $? 0 "Replace non-FDB nexthop to FDB nexthop" + + # Replace FDB nexthop address while in a group + run_cmd "$IP nexthop add id 19 group 18 fdb" + run_cmd "$IP nexthop replace id 18 via 172.16.1.3 fdb" + log_test $? 0 "Replace FDB nexthop address while in a group" + + # Cannot replace FDB nexthop to non-FDB and vice versa while in a group + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1" + log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group" + run_cmd "$IP nexthop add id 20 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 21 group 20" + run_cmd "$IP nexthop replace id 20 via 172.16.1.2 fdb" + log_test $? 
2 "Replace non-FDB nexthop to FDB nexthop while in a group" + run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100" run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" log_test $? 0 "Fdb mac add with nexthop group" @@ -582,7 +622,7 @@ ipv4_fdb_grp_fcnal() run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self" log_test $? 255 "Fdb mac add with nexthop" - run_cmd "$IP ro add 172.16.0.0/22 nhid 15" + run_cmd "$IP ro add 172.16.0.0/22 nhid 16" log_test $? 2 "Route add with fdb nexthop" run_cmd "$IP ro add 172.16.0.0/22 nhid 103" diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh index b98ea9449b8b..dfb6646cb97b 100755 --- a/tools/testing/selftests/net/forwarding/router.sh +++ b/tools/testing/selftests/net/forwarding/router.sh @@ -18,6 +18,8 @@ # | 2001:db8:1::1/64 2001:db8:2::1/64 | # | | # +-----------------------------------------------------------------+ +# +#shellcheck disable=SC2034 # SC doesn't see our uses of global variables ALL_TESTS=" ping_ipv4 @@ -27,6 +29,7 @@ ALL_TESTS=" ipv4_sip_equal_dip ipv6_sip_equal_dip ipv4_dip_link_local + ipv4_sip_link_local " NUM_NETIFS=4 @@ -330,6 +333,32 @@ ipv4_dip_link_local() tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower } +ipv4_sip_link_local() +{ + local sip=169.254.1.1 + + RET=0 + + # Disable rpfilter to prevent packets to be dropped because of it. + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf."$rp1".rp_filter 0 + + tc filter add dev "$rp2" egress protocol ip pref 1 handle 101 \ + flower src_ip "$sip" action pass + + $MZ "$h1" -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b "$rp1mac" \ + -A "$sip" -B 198.51.100.2 -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? 
"Packets were dropped" + + log_test "IPv4 source IP is link-local" + + tc filter del dev "$rp2" egress protocol ip pref 1 handle 101 flower + sysctl_restore net.ipv4.conf."$rp1".rp_filter + sysctl_restore net.ipv4.conf.all.rp_filter +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh index 1f6f53e284b5..6269d5e23487 100755 --- a/tools/testing/selftests/net/forwarding/sch_ets.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets.sh @@ -11,6 +11,7 @@ ALL_TESTS=" ets_test_strict ets_test_mixed ets_test_dwrr + ets_test_plug classifier_mode ets_test_strict ets_test_mixed diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh index 08240d3e3c87..79d837a2868a 100644 --- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh @@ -224,3 +224,11 @@ ets_test_dwrr() ets_set_dwrr_two_bands xfail_on_slow ets_dwrr_test_01 } + +ets_test_plug() +{ + ets_change_qdisc $put 2 "3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3" "1514 1514" + tc qdisc add dev $put handle 20: parent 10:4 plug + start_traffic_pktsize 100 $h1.10 192.0.2.1 192.0.2.2 00:c1:a0:c1:a0:00 "-c 1" + ets_qdisc_setup $put 2 +} diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 7a3cb4c09e45..d847ff1737c3 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -28,7 +28,7 @@ flush_pids() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index ac1349c4b9e5..b148cadb96d0 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -183,9 +183,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; @@ -1092,6 +1093,7 @@ int main_loop_s(int listensock) struct pollfd polls; socklen_t salen; int remotesock; + int err = 0; int fd = 0; again: @@ -1124,7 +1126,7 @@ again: SOCK_TEST_TCPULP(remotesock, 0); memset(&winfo, 0, sizeof(winfo)); - copyfd_io(fd, remotesock, 1, true, &winfo); + err = copyfd_io(fd, remotesock, 1, true, &winfo); } else { perror("accept"); return 1; @@ -1133,10 +1135,10 @@ again: if (cfg_input) close(fd); - if (--cfg_repeat > 0) + if (!err && --cfg_repeat > 0) goto again; - return 0; + return err; } static void init_rng(void) @@ -1246,7 +1248,7 @@ void xdisconnect(int fd) else xerror("bad family"); - strcpy(cmd, "ss -M | grep -q "); + strcpy(cmd, "ss -Mnt | grep -q "); cmdlen = strlen(cmd); if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen], sizeof(cmd) - cmdlen)) @@ -1256,7 +1258,7 @@ void xdisconnect(int fd) /* * wait until the pending data is completely flushed and all - * the MPTCP sockets reached the closed status. + * the sockets reached the closed status. * disconnect will bypass/ignore/drop any pending data. 
*/ for (i = 0; ; i += msec_sleep) { diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 5e3c56253274..47ecb5b3836e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -134,7 +134,7 @@ ns4="" TEST_GROUP="" # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cin_disconnect" @@ -211,6 +211,11 @@ if $checksum; then done fi +if $capture; then + rndh="${ns1:4}" + mptcp_lib_pr_info "Packet capture files will have this prefix: ${rndh}-" +fi + set_ethtool_flags() { local ns="$1" local dev="$2" @@ -361,7 +366,6 @@ do_transfer() if $capture; then local capuser - local rndh="${connector_ns:4}" if [ -z $SUDO_USER ] ; then capuser="" else diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c index 3cf1e2a612ce..f3bcaa48df8f 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_inq.c +++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c @@ -75,9 +75,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b8af65373b3a..7fd555b123b9 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -8,7 +8,7 @@ # ShellCheck incorrectly believes that most of the code here is unreachable # because it's invoked by variable name, see how the "tests" array is used -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 . 
"$(dirname "${0}")/mptcp_lib.sh" @@ -3842,6 +3842,7 @@ endpoint_tests() # remove and re-add if reset_with_events "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=0 pm_nl_set_limits $ns1 0 3 pm_nl_set_limits $ns2 3 3 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 09cd24b2ae46..d62e653d48b0 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -384,7 +384,7 @@ mptcp_lib_make_file() { mptcp_lib_print_file_err() { ls -l "${1}" 1>&2 echo "Trailing bytes are: " - tail -c 27 "${1}" + tail -c 32 "${1}" | od -x | head -n2 } # $1: input file ; $2: output file ; $3: what kind of file diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index 9934a68df237..112c07c4c37a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -162,9 +162,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; @@ -666,22 +667,26 @@ static void process_one_client(int fd, int pipefd) do_getsockopts(&s, fd, ret, ret2); if (s.mptcpi_rcv_delta != (uint64_t)ret + 1) - xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64, s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - ret); + xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64 ", diff %" PRId64, + s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - (ret + 1)); /* be nice when running on top of older kernel */ if (s.pkt_stats_avail) { if (s.last_sample.mptcpi_bytes_sent != ret2) - xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_sent, ret2, s.last_sample.mptcpi_bytes_sent - ret2); if (s.last_sample.mptcpi_bytes_received != ret) - xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_received, ret, s.last_sample.mptcpi_bytes_received - ret); if (s.last_sample.mptcpi_bytes_acked != ret) - xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64, - s.last_sample.mptcpi_bytes_acked, ret2, - s.last_sample.mptcpi_bytes_acked - ret2); + xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, + s.last_sample.mptcpi_bytes_acked, ret, + s.last_sample.mptcpi_bytes_acked - ret); } close(fd); diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 418a903c3a4d..f01989be6e9b 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -95,7 +95,7 @@ init() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { mptcp_lib_ns_exit "${ns1}" "${ns2}" "${ns_sbox}" diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 2e6648a2b2c0..ec6a87588191 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -32,7 +32,7 @@ ns1="" 
err=$(mktemp) # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "${err}" @@ -70,8 +70,9 @@ format_endpoints() { mptcp_lib_pm_nl_format_endpoints "${@}" } +# This function is invoked indirectly +#shellcheck disable=SC2317,SC2329 get_endpoint() { - # shellcheck disable=SC2317 # invoked indirectly mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" } @@ -198,6 +199,7 @@ set_limits 1 9 2>/dev/null check "get_limits" "${default_limits}" "subflows above hard limit" set_limits 8 8 +flush_endpoint ## to make sure it doesn't affect the limits check "get_limits" "$(format_limits 8 8)" "set limits" flush_endpoint diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 994a556f46c1..93fea3442216 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -188,6 +188,13 @@ static int capture_events(int fd, int event_group) fprintf(stderr, ",error:%u", *(__u8 *)RTA_DATA(attrs)); else if (attrs->rta_type == MPTCP_ATTR_SERVER_SIDE) fprintf(stderr, ",server_side:%u", *(__u8 *)RTA_DATA(attrs)); + else if (attrs->rta_type == MPTCP_ATTR_FLAGS) { + __u16 flags = *(__u16 *)RTA_DATA(attrs); + + /* only print when present, easier */ + if (flags & MPTCP_PM_EV_FLAG_DENY_JOIN_ID0) + fprintf(stderr, ",deny_join_id0:1"); + } attrs = RTA_NEXT(attrs, msg_len); } diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 2329c2f8519b..1903e8e84a31 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -35,7 +35,7 @@ usage() { } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cout" "$sout" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 333064b0b5ac..3d45991f24ed 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -94,7 +94,7 @@ test_fail() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { print_title "Cleanup" @@ -201,6 +201,9 @@ make_connection() is_v6="v4" fi + # set this on the client side only: will not affect the rest + ip netns exec "$ns2" sysctl -q net.mptcp.allow_join_initial_addr_port=0 + :>"$client_evts" :>"$server_evts" @@ -223,23 +226,28 @@ make_connection() local client_token local client_port local client_serverside + local client_nojoin local server_token local server_serverside + local server_nojoin client_token=$(mptcp_lib_evts_get_info token "$client_evts") client_port=$(mptcp_lib_evts_get_info sport "$client_evts") client_serverside=$(mptcp_lib_evts_get_info server_side "$client_evts") + client_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$client_evts") server_token=$(mptcp_lib_evts_get_info token "$server_evts") server_serverside=$(mptcp_lib_evts_get_info server_side "$server_evts") + server_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$server_evts") print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" - if [ "$client_token" != "" ] && [ "$server_token" != "" ] && [ "$client_serverside" = 0 ] && - [ "$server_serverside" = 1 ] + if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && + [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_nojoin:-0}" = 0 ] && 
[ "${server_nojoin:-0}" = 1 ] then test_pass print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}" else - test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})" + test_fail "Expected tokens (c:${client_token} - s:${server_token}), server (c:${client_serverside} - s:${server_serverside}), nojoin (c:${client_nojoin} - s:${server_nojoin})" mptcp_lib_result_print_all_tap exit ${KSFT_FAIL} fi diff --git a/tools/testing/selftests/net/netfilter/conntrack_clash.sh b/tools/testing/selftests/net/netfilter/conntrack_clash.sh index 606a43a60f73..7fc6c5dbd551 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_clash.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_clash.sh @@ -99,7 +99,7 @@ run_one_clash_test() local entries local cre - if ! ip netns exec "$ns" ./udpclash $daddr $dport;then + if ! ip netns exec "$ns" timeout 30 ./udpclash $daddr $dport;then echo "INFO: did not receive expected number of replies for $daddr:$dport" ip netns exec "$ctns" conntrack -S # don't fail: check if clash resolution triggered after all. diff --git a/tools/testing/selftests/net/netfilter/conntrack_resize.sh b/tools/testing/selftests/net/netfilter/conntrack_resize.sh index 788cd56ea4a0..615fe3c6f405 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_resize.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_resize.sh @@ -187,7 +187,7 @@ ct_udpclash() [ -x udpclash ] || return while [ $now -lt $end ]; do - ip netns exec "$ns" ./udpclash 127.0.0.1 $((RANDOM%65536)) > /dev/null 2>&1 + ip netns exec "$ns" timeout 30 ./udpclash 127.0.0.1 $((RANDOM%65536)) > /dev/null 2>&1 now=$(date +%s) done @@ -277,6 +277,7 @@ check_taint() insert_flood() { local n="$1" + local timeout="$2" local r=0 r=$((RANDOM%$insert_count)) @@ -302,7 +303,7 @@ test_floodresize_all() read tainted_then < /proc/sys/kernel/tainted for n in "$nsclient1" "$nsclient2";do - insert_flood "$n" & + insert_flood "$n" "$timeout" & done # resize table constantly while flood/insert/dump/flushs diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a4ee5496f2a1..45832df98295 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -20,6 +20,7 @@ ret=0 SOCAT_TIMEOUT=60 nsin="" +nsin_small="" ns1out="" ns2out="" @@ -36,7 +37,7 @@ cleanup() { cleanup_all_ns - rm -f "$nsin" "$ns1out" "$ns2out" + rm -f "$nsin" "$nsin_small" "$ns1out" "$ns2out" [ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns" } @@ -72,6 +73,7 @@ lmtu=1500 rmtu=2000 filesize=$((2 * 1024 * 1024)) +filesize_small=$((filesize / 16)) usage(){ echo "nft_flowtable.sh [OPTIONS]" @@ -89,7 +91,10 @@ do o) omtu=$OPTARG;; l) lmtu=$OPTARG;; r) rmtu=$OPTARG;; - s) filesize=$OPTARG;; + s) + filesize=$OPTARG + filesize_small=$((OPTARG / 16)) + ;; *) usage;; esac done @@ -215,6 +220,7 @@ if ! 
ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then fi nsin=$(mktemp) +nsin_small=$(mktemp) ns1out=$(mktemp) ns2out=$(mktemp) @@ -265,6 +271,7 @@ check_counters() check_dscp() { local what=$1 + local pmtud="$2" local ok=1 local counter @@ -277,37 +284,39 @@ check_dscp() local pc4z=${counter%*bytes*} local pc4z=${pc4z#*packets} + local failmsg="FAIL: pmtu $pmtu: $what counters do not match, expected" + case "$what" in "dscp_none") if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 + echo "$failmsg dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_fwd") if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 + echo "$failmsg dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_ingress") if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + echo "$failmsg dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; "dscp_egress") if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then - echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 + echo "$failmsg dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2 ret=1 ok=0 fi ;; *) - echo "FAIL: Unknown DSCP check" 1>&2 + echo "$failmsg: Unknown DSCP check" 1>&2 ret=1 ok=0 esac @@ -319,9 +328,9 @@ check_dscp() check_transfer() { - in=$1 - out=$2 - what=$3 + local in=$1 + local out=$2 + local what=$3 if ! cmp "$in" "$out" > /dev/null 2>&1; then echo "FAIL: file mismatch for $what" 1>&2 @@ -342,25 +351,39 @@ test_tcp_forwarding_ip() { local nsa=$1 local nsb=$2 - local dstip=$3 - local dstport=$4 + local pmtu=$3 + local dstip=$4 + local dstport=$5 local lret=0 + local socatc + local socatl + local infile="$nsin" + + if [ $pmtu -eq 0 ]; then + infile="$nsin_small" + fi - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" & + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & lpid=$! busywait 1000 listener_ready - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out" + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" + socatc=$? wait $lpid + socatl=$? - if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then + if [ $socatl -ne 0 ] || [ $socatc -ne 0 ];then + rc=1 + fi + + if ! check_transfer "$infile" "$ns2out" "ns1 -> ns2"; then lret=1 ret=1 fi - if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then + if ! check_transfer "$infile" "$ns1out" "ns1 <- ns2"; then lret=1 ret=1 fi @@ -370,14 +393,16 @@ test_tcp_forwarding_ip() test_tcp_forwarding() { - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 + local pmtu="$3" + + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 return $? } test_tcp_forwarding_set_dscp() { - check_dscp "dscp_none" + local pmtu="$3" ip netns exec "$nsr1" nft -f - <<EOF table netdev dscpmangle { @@ -388,8 +413,8 @@ table netdev dscpmangle { } EOF if [ $? 
-eq 0 ]; then - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - check_dscp "dscp_ingress" + test_tcp_forwarding_ip "$1" "$2" "$3" 10.0.2.99 12345 + check_dscp "dscp_ingress" "$pmtu" ip netns exec "$nsr1" nft delete table netdev dscpmangle else @@ -405,10 +430,10 @@ table netdev dscpmangle { } EOF if [ $? -eq 0 ]; then - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - check_dscp "dscp_egress" + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + check_dscp "dscp_egress" "$pmtu" - ip netns exec "$nsr1" nft flush table netdev dscpmangle + ip netns exec "$nsr1" nft delete table netdev dscpmangle else echo "SKIP: Could not load netdev:egress for veth1" fi @@ -416,48 +441,53 @@ fi # partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3 # counters should have seen packets (before and after ft offload kicks in). ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3 - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - check_dscp "dscp_fwd" + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + check_dscp "dscp_fwd" "$pmtu" } test_tcp_forwarding_nat() { + local nsa="$1" + local nsb="$2" + local pmtu="$3" + local what="$4" local lret - local pmtu - test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 - lret=$? + [ "$pmtu" -eq 0 ] && what="$what (pmtu disabled)" - pmtu=$3 - what=$4 + test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 10.0.2.99 12345 + lret=$? if [ "$lret" -eq 0 ] ; then if [ "$pmtu" -eq 1 ] ;then - check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what" + check_counters "flow offload for ns1/ns2 with masquerade $what" else echo "PASS: flow offload for ns1/ns2 with masquerade $what" fi - test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.6.6.6 1666 lret=$? if [ "$pmtu" -eq 1 ] ;then - check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what" + check_counters "flow offload for ns1/ns2 with dnat $what" elif [ "$lret" -eq 0 ] ; then echo "PASS: flow offload for ns1/ns2 with dnat $what" fi + else + echo "FAIL: flow offload for ns1/ns2 with dnat $what" fi return $lret } make_file "$nsin" "$filesize" +make_file "$nsin_small" "$filesize_small" # First test: # No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed. # Due to MTU mismatch in both directions, all packets (except small packets like pure # acks) have to be handled by normal forwarding path. Therefore, packet counters # are not checked. -if test_tcp_forwarding "$ns1" "$ns2"; then +if test_tcp_forwarding "$ns1" "$ns2" 0; then echo "PASS: flow offloaded for ns1/ns2" else echo "FAIL: flow offload for ns1/ns2:" 1>&2 @@ -489,8 +519,9 @@ table ip nat { } EOF +check_dscp "dscp_none" "0" if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then - echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2 + echo "FAIL: flow offload for ns1/ns2 with dscp update and no pmtu discovery" 1>&2 exit 0 fi @@ -513,6 +544,14 @@ ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null # For earlier tests (large mtus), packets cannot be handled via flowtable # (except pure acks and other small packets). ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null +ip netns exec "$ns2" nft reset counters table inet filter >/dev/null + +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 ""; then + echo "FAIL: flow offload for ns1/ns2 with dscp update and pmtu discovery" 1>&2 + exit 0 +fi + +ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null if ! 
test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 @@ -644,7 +683,7 @@ ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 ip -net "$ns2" route add default via 10.0.2.1 ip -net "$ns2" route add default via dead:2::1 -if test_tcp_forwarding "$ns1" "$ns2"; then +if test_tcp_forwarding "$ns1" "$ns2" 1; then check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" @@ -668,7 +707,7 @@ if [ "$1" = "" ]; then fi echo "re-run with random mtus and file size: -o $o -l $l -r $r -s $filesize" - $0 -o "$o" -l "$l" -r "$r" -s "$filesize" + $0 -o "$o" -l "$l" -r "$r" -s "$filesize" || ret=1 fi exit $ret diff --git a/tools/testing/selftests/net/netfilter/udpclash.c b/tools/testing/selftests/net/netfilter/udpclash.c index 85c7b906ad08..79de163d61ab 100644 --- a/tools/testing/selftests/net/netfilter/udpclash.c +++ b/tools/testing/selftests/net/netfilter/udpclash.c @@ -29,7 +29,7 @@ struct thread_args { int sockfd; }; -static int wait = 1; +static volatile int wait = 1; static void *thread_main(void *varg) { diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 3c8d3455d8e7..b327d3061ed5 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -25,6 +25,7 @@ tests=" nat_related_v4 ip4-nat-related: ICMP related matches work with SNAT netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces + tunnel_metadata ovs: test extraction of tunnel metadata drop_reason drop: test drop reasons are emitted psample psample: Sampling packets with psample" @@ -113,13 +114,13 @@ ovs_add_dp () { } ovs_add_if () { - info "Adding IF to DP: br:$2 if:$3" - if [ "$4" != "-u" ]; then - ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if "$2" "$3" \ - || return 1 + info "Adding IF to DP: br:$3 if:$4 ($2)" + if [ "$5" != "-u" ]; then + ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if \ + -t "$2" "$3" "$4" || return 1 else python3 $ovs_base/ovs-dpctl.py add-if \ - -u "$2" "$3" >$ovs_dir/$3.out 2>$ovs_dir/$3.err & + -u -t "$2" "$3" "$4" >$ovs_dir/$4.out 2>$ovs_dir/$4.err & pid=$! 
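[Editor's aside, not part of the patch: the udpclash.c hunk above only changes "static int wait" to "static volatile int wait". The sketch below, written for this note and assuming a build with -pthread, shows why a plain int busy-wait flag is fragile under optimization: the compiler may hoist the load out of the loop, while volatile forces a fresh read each pass. C11 atomics would additionally give ordering guarantees.]

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* With a plain "int" the worker can spin forever at -O2; "volatile"
 * keeps the load inside the loop so the store from main() is seen. */
static volatile int keep_running = 1;

static void *worker(void *arg)
{
	unsigned long spins = 0;

	while (keep_running)	/* re-read on every iteration */
		spins++;

	printf("worker stopped after %lu spins\n", spins);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	if (pthread_create(&tid, NULL, worker, NULL))
		return 1;

	sleep(1);
	keep_running = 0;	/* observed by the worker thanks to volatile */
	pthread_join(tid, NULL);
	return 0;
}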
on_exit "ovs_sbx $1 kill -TERM $pid 2>/dev/null" fi @@ -166,9 +167,9 @@ ovs_add_netns_and_veths () { fi if [ "$7" != "-u" ]; then - ovs_add_if "$1" "$2" "$4" || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" || return 1 else - ovs_add_if "$1" "$2" "$4" -u || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" -u || return 1 fi if [ $TRACING -eq 1 ]; then @@ -756,6 +757,79 @@ test_upcall_interfaces() { return 0 } +ovs_add_kernel_tunnel() { + local sbxname=$1; shift + local ns=$1; shift + local tnl_type=$1; shift + local name=$1; shift + local addr=$1; shift + + info "setting up kernel ${tnl_type} tunnel ${name}" + ovs_sbx "${sbxname}" ip -netns ${ns} link add dev ${name} type ${tnl_type} $* || return 1 + on_exit "ovs_sbx ${sbxname} ip -netns ${ns} link del ${name} >/dev/null 2>&1" + ovs_sbx "${sbxname}" ip -netns ${ns} addr add dev ${name} ${addr} || return 1 + ovs_sbx "${sbxname}" ip -netns ${ns} link set dev ${name} mtu 1450 up || return 1 +} + +test_tunnel_metadata() { + which arping >/dev/null 2>&1 || return $ksft_skip + + sbxname="test_tunnel_metadata" + sbx_add "${sbxname}" || return 1 + + info "setting up new DP" + ovs_add_dp "${sbxname}" tdp0 -V 2:1 || return 1 + + ovs_add_netns_and_veths "${sbxname}" tdp0 tns left0 l0 \ + 172.31.110.1/24 || return 1 + + info "removing veth interface from openvswitch and setting IP" + ovs_del_if "${sbxname}" tdp0 left0 || return 1 + ovs_sbx "${sbxname}" ip addr add 172.31.110.2/24 dev left0 || return 1 + ovs_sbx "${sbxname}" ip link set left0 up || return 1 + + info "setting up tunnel port in openvswitch" + ovs_add_if "${sbxname}" "vxlan" tdp0 ovs-vxlan0 -u || return 1 + on_exit "ovs_sbx ${sbxname} ip link del ovs-vxlan0" + ovs_wait ip link show ovs-vxlan0 &>/dev/null || return 1 + ovs_sbx "${sbxname}" ip link set ovs-vxlan0 up || return 1 + + configs=$(echo ' + 1 172.31.221.1/24 1155332 32 set udpcsum flags\(df\|csum\) + 2 172.31.222.1/24 1234567 45 set noudpcsum flags\(df\) + 3 172.31.223.1/24 1020304 23 unset udpcsum flags\(csum\) + 4 172.31.224.1/24 1357986 15 unset noudpcsum' | sed '/^$/d') + + while read -r i addr id ttl df csum flags; do + ovs_add_kernel_tunnel "${sbxname}" tns vxlan vxlan${i} ${addr} \ + remote 172.31.110.2 id ${id} dstport 4789 \ + ttl ${ttl} df ${df} ${csum} || return 1 + done <<< "${configs}" + + ovs_wait grep -q 'listening on upcall packet handler' \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + + info "sending arping" + for i in 1 2 3 4; do + ovs_sbx "${sbxname}" ip netns exec tns \ + arping -I vxlan${i} 172.31.22${i}.2 -c 1 \ + >${ovs_dir}/arping.stdout 2>${ovs_dir}/arping.stderr + done + + info "checking that received decapsulated packets carry correct metadata" + while read -r i addr id ttl df csum flags; do + arp_hdr="arp\\(sip=172.31.22${i}.1,tip=172.31.22${i}.2,op=1,sha=" + addrs="src=172.31.110.1,dst=172.31.110.2" + ports="tp_src=[0-9]*,tp_dst=4789" + tnl_md="tunnel\\(tun_id=${id},${addrs},ttl=${ttl},${ports},${flags}\\)" + + ovs_sbx "${sbxname}" grep -qE "MISS upcall.*${tnl_md}.*${arp_hdr}" \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + done <<< "${configs}" + + return 0 +} + run_test() { ( tname="$1" diff --git a/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt new file mode 100644 index 000000000000..26794e7ddfd5 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh + 
./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x602 /proc/sys/net/ipv4/tcp_timestamps=0` + + 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:10(10) win 32792 <mss 1460,nop,nop,sackOK> + +0 > S. 0:0(0) ack 11 win 65535 <mss 1460,nop,nop,sackOK> + +// sk->sk_state is TCP_SYN_RECV + +.1 accept(3, ..., ...) = 4 + +// tcp_disconnect() sets sk->sk_state to TCP_CLOSE + +0 connect(4, AF_UNSPEC, ...) = 0 + +0 > R. 1:1(0) ack 11 win 65535 + +// connect() sets sk->sk_state to TCP_SYN_SENT + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation is now in progress) + +0 > S 0:0(0) win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// tp->fastopen_rsk must be NULL + +1 > S 0:0(0) win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 8> diff --git a/tools/testing/selftests/net/test_vxlan_nh.sh b/tools/testing/selftests/net/test_vxlan_nh.sh new file mode 100755 index 000000000000..20f3369f776b --- /dev/null +++ b/tools/testing/selftests/net/test_vxlan_nh.sh @@ -0,0 +1,223 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh +TESTS=" + basic_tx_ipv4 + basic_tx_ipv6 + learning + proxy_ipv4 + proxy_ipv6 +" +VERBOSE=0 + +################################################################################ +# Utilities + +run_cmd() +{ + local cmd="$1" + local out + local stderr="2>/dev/null" + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + stderr= + fi + + out=$(eval "$cmd" "$stderr") + rc=$? + if [ "$VERBOSE" -eq 1 ] && [ -n "$out" ]; then + echo " $out" + fi + + return $rc +} + +################################################################################ +# Cleanup + +exit_cleanup_all() +{ + cleanup_all_ns + exit "${EXIT_STATUS}" +} + +################################################################################ +# Tests + +nh_stats_get() +{ + ip -n "$ns1" -s -j nexthop show id 10 | jq ".[][\"group_stats\"][][\"packets\"]" +} + +tc_stats_get() +{ + tc_rule_handle_stats_get "dev dummy1 egress" 101 ".packets" "-n $ns1" +} + +basic_tx_common() +{ + local af_str=$1; shift + local proto=$1; shift + local local_addr=$1; shift + local plen=$1; shift + local remote_addr=$1; shift + + RET=0 + + # Test basic Tx functionality. Check that stats are incremented on + # both the FDB nexthop group and the egress device. + + run_cmd "ip -n $ns1 link add name dummy1 up type dummy" + run_cmd "ip -n $ns1 route add $remote_addr/$plen dev dummy1" + run_cmd "tc -n $ns1 qdisc add dev dummy1 clsact" + run_cmd "tc -n $ns1 filter add dev dummy1 egress proto $proto pref 1 handle 101 flower ip_proto udp dst_ip $remote_addr dst_port 4789 action pass" + + run_cmd "ip -n $ns1 address add $local_addr/$plen dev lo" + + run_cmd "ip -n $ns1 nexthop add id 1 via $remote_addr fdb" + run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb" + + run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local $local_addr dstport 4789" + run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static nhid 10" + + run_cmd "ip netns exec $ns1 mausezahn vx0 -a own -b 00:11:22:33:44:55 -c 1 -q" + + busywait "$BUSYWAIT_TIMEOUT" until_counter_is "== 1" nh_stats_get > /dev/null + check_err $? "FDB nexthop group stats did not increase" + + busywait "$BUSYWAIT_TIMEOUT" until_counter_is "== 1" tc_stats_get > /dev/null + check_err $? 
"tc filter stats did not increase" + + log_test "VXLAN FDB nexthop: $af_str basic Tx" +} + +basic_tx_ipv4() +{ + basic_tx_common "IPv4" ipv4 192.0.2.1 32 192.0.2.2 +} + +basic_tx_ipv6() +{ + basic_tx_common "IPv6" ipv6 2001:db8:1::1 128 2001:db8:1::2 +} + +learning() +{ + RET=0 + + # When learning is enabled on the VXLAN device, an incoming packet + # might try to refresh an FDB entry that points to an FDB nexthop group + # instead of an ordinary remote destination. Check that the kernel does + # not crash in this situation. + + run_cmd "ip -n $ns1 address add 192.0.2.1/32 dev lo" + run_cmd "ip -n $ns1 address add 192.0.2.2/32 dev lo" + + run_cmd "ip -n $ns1 nexthop add id 1 via 192.0.2.3 fdb" + run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb" + + run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local 192.0.2.1 dstport 12345 localbypass" + run_cmd "ip -n $ns1 link add name vx1 up type vxlan id 10020 local 192.0.2.2 dstport 54321 learning" + + run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static dst 192.0.2.2 port 54321 vni 10020" + run_cmd "bridge -n $ns1 fdb add 00:aa:bb:cc:dd:ee dev vx1 self static nhid 10" + + run_cmd "ip netns exec $ns1 mausezahn vx0 -a 00:aa:bb:cc:dd:ee -b 00:11:22:33:44:55 -c 1 -q" + + log_test "VXLAN FDB nexthop: learning" +} + +proxy_common() +{ + local af_str=$1; shift + local local_addr=$1; shift + local plen=$1; shift + local remote_addr=$1; shift + local neigh_addr=$1; shift + local ping_cmd=$1; shift + + RET=0 + + # When the "proxy" option is enabled on the VXLAN device, the device + # will suppress ARP requests and IPv6 Neighbor Solicitation messages if + # it is able to reply on behalf of the remote host. That is, if a + # matching and valid neighbor entry is configured on the VXLAN device + # whose MAC address is not behind the "any" remote (0.0.0.0 / ::). The + # FDB entry for the neighbor's MAC address might point to an FDB + # nexthop group instead of an ordinary remote destination. Check that + # the kernel does not crash in this situation. + + run_cmd "ip -n $ns1 address add $local_addr/$plen dev lo" + + run_cmd "ip -n $ns1 nexthop add id 1 via $remote_addr fdb" + run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb" + + run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local $local_addr dstport 4789 proxy" + + run_cmd "ip -n $ns1 neigh add $neigh_addr lladdr 00:11:22:33:44:55 nud perm dev vx0" + + run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static nhid 10" + + run_cmd "ip netns exec $ns1 $ping_cmd" + + log_test "VXLAN FDB nexthop: $af_str proxy" +} + +proxy_ipv4() +{ + proxy_common "IPv4" 192.0.2.1 32 192.0.2.2 192.0.2.3 \ + "arping -b -c 1 -s 192.0.2.1 -I vx0 192.0.2.3" +} + +proxy_ipv6() +{ + proxy_common "IPv6" 2001:db8:1::1 128 2001:db8:1::2 2001:db8:1::3 \ + "ndisc6 -r 1 -s 2001:db8:1::1 -w 1 2001:db8:1::3 vx0" +} + +################################################################################ +# Usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -t <test> Test(s) to run (default: all) + (options: $TESTS) + -p Pause on fail + -v Verbose mode (show commands and output) +EOF +} + +################################################################################ +# Main + +while getopts ":t:pvh" opt; do + case $opt in + t) TESTS=$OPTARG;; + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=$((VERBOSE + 1));; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +require_command mausezahn +require_command arping +require_command ndisc6 +require_command jq + +if ! 
ip nexthop help 2>&1 | grep -q "stats"; then + echo "SKIP: iproute2 ip too old, missing nexthop stats support" + exit "$ksft_skip" +fi + +trap exit_cleanup_all EXIT + +for t in $TESTS +do + setup_ns ns1; $t; cleanup_all_ns; +done diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 5ded3b3a7538..dd093f9df6f1 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -181,13 +181,12 @@ static int tls_send_cmsg(int fd, unsigned char record_type, return sendmsg(fd, &msg, flags); } -static int tls_recv_cmsg(struct __test_metadata *_metadata, - int fd, unsigned char record_type, - void *data, size_t len, int flags) +static int __tls_recv_cmsg(struct __test_metadata *_metadata, + int fd, unsigned char *ctype, + void *data, size_t len, int flags) { char cbuf[CMSG_SPACE(sizeof(char))]; struct cmsghdr *cmsg; - unsigned char ctype; struct msghdr msg; struct iovec vec; int n; @@ -206,7 +205,20 @@ static int tls_recv_cmsg(struct __test_metadata *_metadata, EXPECT_NE(cmsg, NULL); EXPECT_EQ(cmsg->cmsg_level, SOL_TLS); EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE); - ctype = *((unsigned char *)CMSG_DATA(cmsg)); + if (ctype) + *ctype = *((unsigned char *)CMSG_DATA(cmsg)); + + return n; +} + +static int tls_recv_cmsg(struct __test_metadata *_metadata, + int fd, unsigned char record_type, + void *data, size_t len, int flags) +{ + unsigned char ctype; + int n; + + n = __tls_recv_cmsg(_metadata, fd, &ctype, data, len, flags); EXPECT_EQ(ctype, record_type); return n; @@ -2164,6 +2176,284 @@ TEST_F(tls, rekey_poll_delay) } } +struct raw_rec { + unsigned int plain_len; + unsigned char plain_data[100]; + unsigned int cipher_len; + unsigned char cipher_data[128]; +}; + +/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: 'Hello world' */ +static const struct raw_rec id0_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0xa2, 0x33, + 0xde, 0x8d, 0x94, 0xf0, 0x29, 0x6c, 0xb1, 0xaf, + 0x6a, 0x75, 0xb2, 0x93, 0xad, 0x45, 0xd5, 0xfd, + 0x03, 0x51, 0x57, 0x8f, 0xf9, 0xcc, 0x3b, 0x42, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:0, plaintext: '' */ +static const struct raw_rec id0_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x38, 0x7b, + 0xa6, 0x1c, 0xdd, 0xa7, 0x19, 0x33, 0xab, 0xae, + 0x88, 0xe1, 0xd2, 0x08, 0x4f, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: '' */ +static const struct raw_rec id0_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xc5, 0x37, 0x90, + 0x70, 0x45, 0x89, 0xfb, 0x5c, 0xc7, 0x89, 0x03, + 0x68, 0x80, 0xd3, 0xd8, 0xcc, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: 'Hello world' */ +static const struct raw_rec id1_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x3a, 0x1a, 0x9c, + 0xd0, 0xa8, 0x9a, 0xd6, 0x69, 0xd6, 0x1a, 0xe3, + 0xb5, 0x1f, 0x0d, 0x2c, 0xe2, 0x97, 0x46, 0xff, + 0x2b, 0xcc, 0x5a, 0xc4, 0xa3, 0xb9, 0xef, 0xba, + }, +}; + +/* TLS 1.2, AES_CCM, 
ctrl, seqno:1, plaintext: '' */ +static const struct raw_rec id1_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x3e, 0xf0, 0xfe, + 0xee, 0xd9, 0xe2, 0x5d, 0xc7, 0x11, 0x4c, 0xe6, + 0xb4, 0x7e, 0xef, 0x40, 0x2b, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: '' */ +static const struct raw_rec id1_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0xce, 0xfc, 0x86, + 0xc8, 0xf0, 0x55, 0xf9, 0x47, 0x3f, 0x74, 0xdc, + 0xc9, 0xbf, 0xfe, 0x5b, 0xb1, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: 'Hello world' */ +static const struct raw_rec id2_ctrl_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19, + 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87, + 0x2a, 0x04, 0x11, 0x3d, 0xf8, 0x64, 0x5f, 0x36, + 0x8b, 0xa8, 0xee, 0x4c, 0x6d, 0x62, 0xa5, 0x00, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: 'Hello world' */ +static const struct raw_rec id2_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19, + 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87, + 0x8e, 0xa1, 0xd0, 0xcd, 0x33, 0xb5, 0x86, 0x2b, + 0x17, 0xf1, 0x52, 0x2a, 0x55, 0x62, 0x65, 0x11, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: '' */ +static const struct raw_rec id2_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xdc, 0x5c, 0x0e, + 0x41, 0xdd, 0xba, 0xd3, 0xcc, 0xcf, 0x6d, 0xd9, + 0x06, 0xdb, 0x79, 0xe5, 0x5d, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: '' */ +static const struct raw_rec id2_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xc3, 0xca, 0x26, + 0x22, 0xe4, 0x25, 0xfb, 0x5f, 0x6d, 0xbf, 0x83, + 0x30, 0x48, 0x69, 0x1a, 0x47, + }, +}; + +FIXTURE(zero_len) +{ + int fd, cfd; + bool notls; +}; + +FIXTURE_VARIANT(zero_len) +{ + const struct raw_rec *recs[4]; + ssize_t recv_ret[4]; +}; + +FIXTURE_VARIANT_ADD(zero_len, data_data_data) +{ + .recs = { &id0_data_l11, &id1_data_l11, &id2_data_l11, }, + .recv_ret = { 33, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, data_0ctrl_data) +{ + .recs = { &id0_data_l11, &id1_ctrl_l0, &id2_data_l11, }, + .recv_ret = { 11, 0, 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0data) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l0, }, + .recv_ret = { -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_ctrl) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l11, }, + .recv_ret = { 0, 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0ctrl) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l0, }, + .recv_ret = { 0, 0, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0ctrl_0ctrl_0ctrl) +{ + .recs = { &id0_ctrl_l0, &id1_ctrl_l0, &id2_ctrl_l0, }, + .recv_ret = { 0, 0, 0, -EAGAIN, 
}, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_data) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l11, }, + .recv_ret = { 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, data_0data_0data) +{ + .recs = { &id0_data_l11, &id1_data_l0, &id2_data_l0, }, + .recv_ret = { 11, -EAGAIN, }, +}; + +FIXTURE_SETUP(zero_len) +{ + struct tls_crypto_info_keys tls12; + int ret; + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, + &tls12, 0); + + ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); + if (self->notls) + return; + + /* Don't install keys on fd, we'll send raw records */ + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len); + ASSERT_EQ(ret, 0); +} + +FIXTURE_TEARDOWN(zero_len) +{ + close(self->fd); + close(self->cfd); +} + +TEST_F(zero_len, test) +{ + const struct raw_rec *const *rec; + unsigned char buf[128]; + int rec_off; + int i; + + for (i = 0; i < 4 && variant->recs[i]; i++) + EXPECT_EQ(send(self->fd, variant->recs[i]->cipher_data, + variant->recs[i]->cipher_len, 0), + variant->recs[i]->cipher_len); + + rec = &variant->recs[0]; + rec_off = 0; + for (i = 0; i < 4; i++) { + int j, ret; + + ret = variant->recv_ret[i] >= 0 ? variant->recv_ret[i] : -1; + EXPECT_EQ(__tls_recv_cmsg(_metadata, self->cfd, NULL, + buf, sizeof(buf), MSG_DONTWAIT), ret); + if (ret == -1) + EXPECT_EQ(errno, -variant->recv_ret[i]); + if (variant->recv_ret[i] == -EAGAIN) + break; + + for (j = 0; j < ret; j++) { + while (rec_off == (*rec)->plain_len) { + rec++; + rec_off = 0; + } + EXPECT_EQ(buf[j], (*rec)->plain_data[rec_off]); + rec_off++; + } + } +}; + FIXTURE(tls_err) { int fd, cfd; @@ -2480,6 +2770,22 @@ TEST_F(tls_err, poll_partial_rec_async) } } +/* Use OOB+large send to trigger copy mode due to memory pressure. + * OOB causes a short read. + */ +TEST_F(tls_err, oob_pressure) +{ + char buf[1<<16]; + int i; + + memrnd(buf, sizeof(buf)); + + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); + EXPECT_EQ(send(self->fd2, buf, sizeof(buf), 0), sizeof(buf)); + for (i = 0; i < 64; i++) + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); +} + TEST(non_established) { struct tls12_crypto_info_aes_gcm_256 tls12; struct sockaddr_in addr; @@ -2708,6 +3014,67 @@ TEST(prequeue) { close(cfd); } +TEST(data_steal) { + struct tls_crypto_info_keys tls; + char buf[20000], buf2[20000]; + struct sockaddr_in addr; + int sfd, cfd, ret, fd; + int pid, status; + socklen_t len; + + len = sizeof(addr); + memrnd(buf, sizeof(buf)); + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls, 0); + + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = 0; + + fd = socket(AF_INET, SOCK_STREAM, 0); + sfd = socket(AF_INET, SOCK_STREAM, 0); + + ASSERT_EQ(bind(sfd, &addr, sizeof(addr)), 0); + ASSERT_EQ(listen(sfd, 10), 0); + ASSERT_EQ(getsockname(sfd, &addr, &len), 0); + ASSERT_EQ(connect(fd, &addr, sizeof(addr)), 0); + ASSERT_GE(cfd = accept(sfd, &addr, &len), 0); + close(sfd); + + ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); + if (ret) { + ASSERT_EQ(errno, ENOENT); + SKIP(return, "no TLS support"); + } + ASSERT_EQ(setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")), 0); + + /* Spawn a child and get it into the read wait path of the underlying + * TCP socket. 
+ */ + pid = fork(); + ASSERT_GE(pid, 0); + if (!pid) { + EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL), + sizeof(buf) / 2); + exit(!__test_passed(_metadata)); + } + + usleep(10000); + ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0); + ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0); + + EXPECT_EQ(send(fd, buf, sizeof(buf), 0), sizeof(buf)); + EXPECT_EQ(wait(&status), pid); + EXPECT_EQ(status, 0); + EXPECT_EQ(recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT), -1); + /* Don't check errno, the error will be different depending + * on what random bytes TLS interpreted as the record length. + */ + + close(fd); + close(cfd); +} + static void __attribute__((constructor)) fips_check(void) { int res; FILE *f; diff --git a/tools/testing/selftests/powerpc/include/instructions.h b/tools/testing/selftests/powerpc/include/instructions.h index 4efa6314bd96..864f0c9f1afc 100644 --- a/tools/testing/selftests/powerpc/include/instructions.h +++ b/tools/testing/selftests/powerpc/include/instructions.h @@ -67,7 +67,7 @@ static inline int paste_last(void *i) #define PPC_INST_PASTE_LAST __PASTE(0, 0, 1, 1) /* This defines the prefixed load/store instructions */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define stringify_in_c(...) __VA_ARGS__ #else # define __stringify_in_c(...) #__VA_ARGS__ diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 19bb333e2485..6b78a8382d40 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -18,6 +18,7 @@ /proc-tid0 /proc-uptime-001 /proc-uptime-002 +/proc-pidns /read /self /setns-dcache diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 50aba102201a..be3013515aae 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -28,5 +28,6 @@ TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self TEST_GEN_PROGS += proc-multiple-procfs TEST_GEN_PROGS += proc-fsconfig-hidepid +TEST_GEN_PROGS += proc-pidns include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c index 66773685a047..94bba4553130 100644 --- a/tools/testing/selftests/proc/proc-maps-race.c +++ b/tools/testing/selftests/proc/proc-maps-race.c @@ -202,11 +202,11 @@ static void print_first_lines(char *text, int nr) int offs = end - text; text[offs] = '\0'; - printf(text); + printf("%s", text); text[offs] = '\n'; printf("\n"); } else { - printf(text); + printf("%s", text); } } @@ -221,7 +221,7 @@ static void print_last_lines(char *text, int nr) nr--; start--; } - printf(start); + printf("%s", start); } static void print_boundaries(const char *title, FIXTURE_DATA(proc_maps_race) *self) diff --git a/tools/testing/selftests/proc/proc-pidns.c b/tools/testing/selftests/proc/proc-pidns.c new file mode 100644 index 000000000000..52500597f951 --- /dev/null +++ b/tools/testing/selftests/proc/proc-pidns.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai <cyphar@cyphar.com> + * Copyright (C) 2025 SUSE LLC. 
+ */ + +#include <assert.h> +#include <errno.h> +#include <sched.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/prctl.h> + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +static int touch(char *path) +{ + int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC, 0644); + if (fd < 0) + return -1; + return close(fd); +} + +FIXTURE(ns) +{ + int host_mntns, host_pidns; + int dummy_pidns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. */ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); + + /* + * Create a proper tmpfs that we can use and will disappear once we + * leave this mntns. + */ + ASSERT_SUCCESS(mount("tmpfs", "/tmp", "tmpfs", 0, NULL)); + + /* + * Create a pidns we can use for later tests. We need to fork off a + * child so that we get a usable nsfd that we can bind-mount and open. + */ + ASSERT_SUCCESS(mkdir("/tmp/dummy", 0755)); + ASSERT_SUCCESS(touch("/tmp/dummy/pidns")); + ASSERT_SUCCESS(mkdir("/tmp/dummy/proc", 0755)); + + self->host_pidns = open("/proc/self/ns/pid", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_pidns); + ASSERT_SUCCESS(unshare(CLONE_NEWPID)); + + pid_t pid = fork(); + ASSERT_SUCCESS(pid); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGKILL); + ASSERT_SUCCESS(mount("/proc/self/ns/pid", "/tmp/dummy/pidns", NULL, MS_BIND, NULL)); + ASSERT_SUCCESS(mount("proc", "/tmp/dummy/proc", "proc", 0, NULL)); + exit(0); + } + + int wstatus; + ASSERT_EQ(waitpid(pid, &wstatus, 0), pid); + ASSERT_TRUE(WIFEXITED(wstatus)); + ASSERT_EQ(WEXITSTATUS(wstatus), 0); + + ASSERT_SUCCESS(setns(self->host_pidns, CLONE_NEWPID)); + + self->dummy_pidns = open("/tmp/dummy/pidns", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->dummy_pidns); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); + + ASSERT_SUCCESS(close(self->host_pidns)); + ASSERT_SUCCESS(close(self->dummy_pidns)); +} + +TEST_F(ns, pidns_mount_string_path) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc-host", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-host", "proc", 0, "pidns=/proc/self/ns/pid")); + ASSERT_SUCCESS(access("/tmp/proc-host/self/", X_OK)); + + ASSERT_SUCCESS(mkdir("/tmp/proc-dummy", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-dummy", "proc", 0, "pidns=/tmp/dummy/pidns")); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/1/", X_OK)); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/self/", X_OK)); +} + +TEST_F(ns, pidns_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + 
ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_remount) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc", "proc", 0, "")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); + + ASSERT_ERRNO_EQ(-EBUSY, mount(NULL, "/tmp/proc", NULL, MS_REMOUNT, "pidns=/tmp/dummy/pidns")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/README b/tools/testing/selftests/riscv/README new file mode 100644 index 000000000000..443da395da68 --- /dev/null +++ b/tools/testing/selftests/riscv/README @@ -0,0 +1,24 @@ +KSelfTest RISC-V +================ + +- These tests are riscv specific and so not built or run but just skipped + completely when env-variable ARCH is found to be different than 'riscv'. 
+ +- Holding true the above, RISC-V KSFT tests can be run within the + KSelfTest framework using standard Linux top-level-makefile targets: + + $ make TARGETS=riscv kselftest-clean + $ make TARGETS=riscv kselftest + + or + + $ make -C tools/testing/selftests TARGETS=riscv \ + INSTALL_PATH=<your-installation-path> install + + or, alternatively, only specific riscv/ subtargets can be picked: + + $ make -C tools/testing/selftests TARGETS=riscv RISCV_SUBTARGETS="mm vector" \ + INSTALL_PATH=<your-installation-path> install + + Further details on building and running KSFT can be found in: + Documentation/dev-tools/kselftest.rst diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c index 1c9ceb661c43..0cfbb111a2d0 100644 --- a/tools/testing/selftests/sched_ext/hotplug.c +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -6,7 +6,6 @@ #include <bpf/bpf.h> #include <sched.h> #include <scx/common.h> -#include <sched.h> #include <sys/wait.h> #include <unistd.h> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 61acbd45ffaa..874f17763536 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -24,6 +24,7 @@ #include <linux/filter.h> #include <sys/prctl.h> #include <sys/ptrace.h> +#include <sys/time.h> #include <sys/user.h> #include <linux/prctl.h> #include <linux/ptrace.h> @@ -73,6 +74,14 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + +#ifndef __naked +#define __naked __attribute__((__naked__)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -3547,6 +3556,10 @@ static void signal_handler(int signal) perror("write from signal"); } +static void signal_handler_nop(int signal) +{ +} + TEST(user_notification_signal) { pid_t pid; @@ -4819,6 +4832,132 @@ TEST(user_notification_wait_killable_fatal) EXPECT_EQ(SIGTERM, WTERMSIG(status)); } +/* Ensure signals after the reply do not interrupt */ +TEST(user_notification_wait_killable_after_reply) +{ + int i, max_iter = 100000; + int listener, status; + int pipe_fds[2]; + pid_t pid; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) + { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + listener = user_notif_syscall( + __NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV); + ASSERT_GE(listener, 0); + + /* + * Used to count invocations. One token is transferred from the child + * to the parent per syscall invocation, the parent tries to take + * one token per successful RECV. If the syscall is restarted after + * RECV the parent will try to get two tokens while the child only + * provided one. + */ + ASSERT_EQ(pipe(pipe_fds), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct sigaction new_action = { + .sa_handler = signal_handler_nop, + .sa_flags = SA_RESTART, + }; + struct itimerval timer = { + .it_value = { .tv_usec = 1000 }, + .it_interval = { .tv_usec = 1000 }, + }; + char c = 'a'; + + close(pipe_fds[0]); + + /* Setup the sigaction with SA_RESTART */ + if (sigaction(SIGALRM, &new_action, NULL)) { + perror("sigaction"); + exit(1); + } + + /* + * Kill with SIGALRM repeatedly, to try to hit the race when + * handling the syscall. 
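[Editor's aside, not part of the patch: the new seccomp test arms a 1 ms ITIMER_REAL with a SA_RESTART handler so signals keep landing on the child's syscalls without aborting them. The standalone sketch below, written for this note, shows the SA_RESTART mechanics in isolation using a pipe for the blocking read.]

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

static void on_alarm(int sig)
{
	(void)sig;	/* nothing to do: we only want the interruptions */
}

int main(void)
{
	struct sigaction sa;
	struct itimerval timer = {
		.it_value    = { .tv_usec = 1000 },	/* first tick after 1 ms */
		.it_interval = { .tv_usec = 1000 },	/* then every 1 ms */
	};
	int pfd[2];
	char c;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_alarm;
	sa.sa_flags = SA_RESTART;
	sigemptyset(&sa.sa_mask);

	if (pipe(pfd) || sigaction(SIGALRM, &sa, NULL) ||
	    setitimer(ITIMER_REAL, &timer, NULL))
		return 1;

	if (fork() == 0) {
		/* Child: interval timers are not inherited across fork(),
		 * so this sleep is undisturbed.  Write after ~100 ms. */
		usleep(100000);
		if (write(pfd[1], "x", 1) != 1)
			_exit(1);
		_exit(0);
	}

	/* The read() below is interrupted many times by SIGALRM, but with
	 * SA_RESTART it is restarted instead of failing with EINTR. */
	if (read(pfd[0], &c, 1) == 1)
		printf("read completed despite the timer (got '%c')\n", c);
	else
		perror("read");

	return 0;
}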
+ */ + if (setitimer(ITIMER_REAL, &timer, NULL) < 0) + perror("setitimer"); + + for (i = 0; i < max_iter; ++i) { + int fd; + + /* Send one token per iteration to catch repeats. */ + if (write(pipe_fds[1], &c, sizeof(c)) != 1) { + perror("write"); + exit(1); + } + + fd = syscall(__NR_dup, 0); + if (fd < 0) { + perror("dup"); + exit(1); + } + close(fd); + } + + exit(0); + } + + close(pipe_fds[1]); + + for (i = 0; i < max_iter; ++i) { + struct seccomp_notif req = {}; + struct seccomp_notif_addfd addfd = {}; + struct pollfd pfd = { + .fd = pipe_fds[0], + .events = POLLIN, + }; + char c; + + /* + * Try to receive one token. If it failed, one child syscall + * was restarted after RECV and needed to be handled twice. + */ + ASSERT_EQ(poll(&pfd, 1, 1000), 1) + kill(pid, SIGKILL); + + ASSERT_EQ(read(pipe_fds[0], &c, sizeof(c)), 1) + kill(pid, SIGKILL); + + /* + * Get the notification, reply to it as fast as possible to test + * whether the child wrongly skips going into the non-preemptible + * (TASK_KILLABLE) state. + */ + do + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req); + while (ret < 0 && errno == ENOENT); /* Accept interruptions before RECV */ + ASSERT_EQ(ret, 0) + kill(pid, SIGKILL); + + addfd.id = req.id; + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + addfd.srcfd = 0; + ASSERT_GE(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), 0) + kill(pid, SIGKILL); + } + + /* + * Wait for the process to exit, and make sure the process terminated + * with a zero exit code.. + */ + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + struct tsync_vs_thread_leader_args { pthread_t leader; }; @@ -4896,7 +5035,36 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ + +/* + * We need naked probed_uprobe function. Using __nocf_check + * check to skip possible endbr64 instruction and ignoring + * -Wattributes, otherwise the compilation might fail. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +__naked __nocf_check noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of nop5 instruction. + */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#pragma GCC diagnostic pop + +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -4949,35 +5117,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? (uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. 
+ */ + bool uretprobe; +}; + +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe ot __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -4987,12 +5166,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5003,7 +5187,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5017,11 +5201,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * Uprobe is optimized after first hit, so let's hit twice. 
+	 */
+	probed_uprobe();
+	probed_uprobe();
+
+	probed_uretprobe();
 	return 0;
 }
 
-TEST_F(URETPROBE, uretprobe_default_allow)
+TEST_F(UPROBE, uprobe_default_allow)
 {
 	struct sock_filter filter[] = {
 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
@@ -5034,7 +5224,7 @@ TEST_F(URETPROBE, uretprobe_default_allow)
 	ASSERT_EQ(0, run_probed_with_filter(&prog));
 }
 
-TEST_F(URETPROBE, uretprobe_default_block)
+TEST_F(UPROBE, uprobe_default_block)
 {
 	struct sock_filter filter[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
@@ -5051,11 +5241,14 @@ TEST_F(URETPROBE, uretprobe_default_block)
 	ASSERT_EQ(0, run_probed_with_filter(&prog));
 }
 
-TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
+TEST_F(UPROBE, uprobe_block_syscall)
 {
 	struct sock_filter filter[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 			offsetof(struct seccomp_data, nr)),
+#ifdef __NR_uprobe
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2),
+#endif
 #ifdef __NR_uretprobe
 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1),
 #endif
@@ -5070,11 +5263,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
 	ASSERT_EQ(0, run_probed_with_filter(&prog));
 }
 
-TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
+TEST_F(UPROBE, uprobe_default_block_with_syscall)
 {
 	struct sock_filter filter[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 			offsetof(struct seccomp_data, nr)),
+#ifdef __NR_uprobe
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0),
+#endif
 #ifdef __NR_uretprobe
 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0),
 #endif
diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
index 23a61e5b99d0..998e5a2f4579 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -186,6 +186,204 @@
         ]
     },
     {
+        "id": "34c0",
+        "name": "Test TBF with HHF Backlog Accounting in gso_skb case against underflow",
+        "category": [
+            "qdisc",
+            "tbf",
+            "hhf"
+        ],
+        "plugins": {
+            "requires": [
+                "nsPlugin"
+            ]
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+            "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 hhf limit 1000",
+            [
+                "ping -I $DUMMY -c2 10.10.11.11",
+                1
+            ],
+            "$TC qdisc change dev $DUMMY handle 2: parent 1:1 hhf limit 1"
+        ],
+        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+        "matchPattern": "backlog 0b 0p",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
+    },
+    {
+        "id": "fd68",
+        "name": "Test TBF with CODEL Backlog Accounting in gso_skb case against underflow",
+        "category": [
+            "qdisc",
+            "tbf",
+            "codel"
+        ],
+        "plugins": {
+            "requires": [
+                "nsPlugin"
+            ]
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+            "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 codel limit 1000",
+            [
+                "ping -I $DUMMY -c2 10.10.11.11",
+                1
+            ],
+            "$TC qdisc change dev $DUMMY handle 2: parent 1:1 codel limit 1"
+        ],
+        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+        "matchPattern": "backlog 0b 0p",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
+    },
+    {
+        "id": "514e",
+        "name": "Test TBF with PIE Backlog Accounting in gso_skb case against underflow",
+        "category": [
+            "qdisc",
+            "tbf",
+            "pie"
+        ],
+        "plugins": {
+            "requires": [
+                "nsPlugin"
+            ]
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+            "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 pie limit 1000",
+            [
+                "ping -I $DUMMY -c2 10.10.11.11",
+                1
+            ],
+            "$TC qdisc change dev $DUMMY handle 2: parent 1:1 pie limit 1"
+        ],
+        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+        "matchPattern": "backlog 0b 0p",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
+    },
+    {
+        "id": "6c97",
+        "name": "Test TBF with FQ Backlog Accounting in gso_skb case against underflow",
+        "category": [
+            "qdisc",
+            "tbf",
+            "fq"
+        ],
+        "plugins": {
+            "requires": [
+                "nsPlugin"
+            ]
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+            "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq limit 1000",
+            [
+                "ping -I $DUMMY -c2 10.10.11.11",
+                1
+            ],
+            "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq limit 1"
+        ],
+        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+        "matchPattern": "backlog 0b 0p",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
+    },
+    {
+        "id": "5d0b",
+        "name": "Test TBF with FQ_CODEL Backlog Accounting in gso_skb case against underflow",
+        "category": [
+            "qdisc",
+            "tbf",
+            "fq_codel"
+        ],
+        "plugins": {
+            "requires": [
+                "nsPlugin"
+            ]
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+            "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_codel limit 1000",
+            [
+                "ping -I $DUMMY -c2 10.10.11.11",
+                1
+            ],
+            "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_codel limit 1"
+        ],
+        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+        "matchPattern": "backlog 0b 0p",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
+    },
+    {
+        "id": "21c3",
+        "name": "Test TBF with FQ_PIE Backlog Accounting in gso_skb case against underflow",
+        "category": [
+            "qdisc",
+            "tbf",
+            "fq_pie"
+        ],
+        "plugins": {
+            "requires": [
+                "nsPlugin"
+            ]
+        },
+        "setup": [
+            "$IP link set dev $DUMMY up || true",
+            "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+            "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+            "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_pie limit 1000",
+            [
+                "ping -I $DUMMY -c2 10.10.11.11",
+                1
+            ],
+            "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_pie limit 1"
+        ],
+        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+        "matchPattern": "backlog 0b 0p",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
+    },
+    {
         "id": "a4bb",
         "name": "Test FQ_CODEL with HTB parent - force packet drop with empty queue",
         "category": [
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 2d93ac860bd5..cd9fe69ecce2 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -20,7 +20,7 @@ static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q,
 	struct io_uring_sqe *sqe[1];
 
 	ublk_io_alloc_sqes(t, sqe, 1);
-	io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC);
+	io_uring_prep_fsync(sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, IORING_FSYNC_DATASYNC);
 	io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
 	/* bit63 marks us as tgt io */
 	sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
@@ -42,7 +42,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	if (!sqe[0])
 		return -ENOMEM;
 
-	io_uring_prep_rw(op, sqe[0], 1 /*fds[1]*/,
+	io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/,
 			addr,
 			iod->nr_sectors << 9,
 			iod->start_sector << 9);
@@ -56,19 +56,19 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	ublk_io_alloc_sqes(t, sqe, 3);
 
-	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+	io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 	sqe[0]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 
-	io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0,
+	io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0,
 		iod->nr_sectors << 9,
 		iod->start_sector << 9);
 	sqe[1]->buf_index = tag;
 	sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
 	sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 
-	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+	io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[2]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	return 2;
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 95188065b2e9..b71faba86c3b 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -432,7 +432,7 @@ static void ublk_thread_deinit(struct ublk_thread *t)
 	}
 }
 
-static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
+static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
 {
 	struct ublk_dev *dev = q->dev;
 	int depth = dev->dev_info.queue_depth;
@@ -446,6 +446,9 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 	q->flags = dev->dev_info.flags;
 	q->flags |= extra_flags;
 
+	/* Cache fd in queue for fast path access */
+	q->ublk_fd = dev->fds[0];
+
 	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
 	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
 	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
@@ -481,9 +484,10 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 	return -ENOMEM;
 }
 
-static int ublk_thread_init(struct ublk_thread *t)
+static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
 {
 	struct ublk_dev *dev = t->dev;
+	unsigned long long flags = dev->dev_info.flags | extra_flags;
 	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
 	int ret;
 
@@ -512,7 +516,17 @@ static int ublk_thread_init(struct ublk_thread *t)
 
 	io_uring_register_ring_fd(&t->ring);
 
-	ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
+	if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
+		/* Register only backing files starting from index 1, exclude ublk control device */
+		if (dev->nr_fds > 1) {
+			ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
+		} else {
+			/* No backing files to register, skip file registration */
+			ret = 0;
+		}
+	} else {
+		ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
+	}
 	if (ret) {
 		ublk_err("ublk dev %d thread %d register files failed %d\n",
 			 t->dev->dev_info.dev_id, t->idx, ret);
@@ -626,9 +640,12 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
 
 	/* These fields should be written once, never change */
 	ublk_set_sqe_cmd_op(sqe[0], cmd_op);
-	sqe[0]->fd = 0;	/* dev->fds[0] */
+	sqe[0]->fd = ublk_get_registered_fd(q, 0);	/* dev->fds[0] */
 	sqe[0]->opcode = IORING_OP_URING_CMD;
-	sqe[0]->flags = IOSQE_FIXED_FILE;
+	if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
+		sqe[0]->flags = 0;	/* Use raw FD, not fixed file */
+	else
+		sqe[0]->flags = IOSQE_FIXED_FILE;
 	sqe[0]->rw_flags = 0;
 	cmd->tag = io->tag;
 	cmd->q_id = q->q_id;
@@ -832,6 +849,7 @@ struct ublk_thread_info {
 	unsigned		idx;
 	sem_t			*ready;
 	cpu_set_t		*affinity;
+	unsigned long long	extra_flags;
 };
 
 static void *ublk_io_handler_fn(void *data)
@@ -844,7 +862,7 @@ static void *ublk_io_handler_fn(void *data)
 
 	t->dev = info->dev;
 	t->idx = info->idx;
-	ret = ublk_thread_init(t);
+	ret = ublk_thread_init(t, info->extra_flags);
 	if (ret) {
 		ublk_err("ublk dev %d thread %u init failed\n",
 				dev_id, t->idx);
@@ -934,6 +952,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 
 	if (ctx->auto_zc_fallback)
 		extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
+	if (ctx->no_ublk_fixed_fd)
+		extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;
 
 	for (i = 0; i < dinfo->nr_hw_queues; i++) {
 		dev->q[i].dev = dev;
@@ -951,6 +971,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		tinfo[i].dev = dev;
 		tinfo[i].idx = i;
 		tinfo[i].ready = &ready;
+		tinfo[i].extra_flags = extra_flags;
 
 		/*
 		 * If threads are not tied 1:1 to queues, setting thread
@@ -1400,7 +1421,7 @@ static int cmd_dev_get_features(void)
 		if (!((1ULL << i) & features))
 			continue;
 
-		if (i < sizeof(feat_map) / sizeof(feat_map[0]))
+		if (i < ARRAY_SIZE(feat_map))
 			feat = feat_map[i];
 		else
 			feat = "unknown";
@@ -1471,13 +1492,13 @@ static void __cmd_create_help(char *exe, bool recovery)
 	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
 			exe, recovery ? "recover" : "add");
 	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
-	printf("\t[-e 0|1 ] [-i 0|1]\n");
+	printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n");
 	printf("\t[--nthreads threads] [--per_io_tasks]\n");
 	printf("\t[target options] [backfile1] [backfile2] ...\n");
 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
 	printf("\tdefault: nthreads=nr_queues");
 
-	for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
+	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
 		const struct ublk_tgt_ops *ops = tgt_ops_list[i];
 
 		if (ops->usage)
@@ -1534,6 +1555,7 @@ int main(int argc, char *argv[])
 		{ "size",		1,	NULL, 's'},
 		{ "nthreads",		1,	NULL,  0 },
 		{ "per_io_tasks",	0,	NULL,  0 },
+		{ "no_ublk_fixed_fd",	0,	NULL,  0 },
 		{ 0, 0, 0, 0 }
 	};
 	const struct ublk_tgt_ops *ops = NULL;
@@ -1613,6 +1635,8 @@ int main(int argc, char *argv[])
 				ctx.nthreads = strtol(optarg, NULL, 10);
 			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
 				ctx.per_io_tasks = 1;
+			if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
+				ctx.no_ublk_fixed_fd = 1;
 			break;
 		case '?':
 			/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 219233f8a053..5e55484fb0aa 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -77,6 +77,7 @@ struct dev_ctx {
 	unsigned int	recovery:1;
 	unsigned int	auto_zc_fallback:1;
 	unsigned int	per_io_tasks:1;
+	unsigned int	no_ublk_fixed_fd:1;
 
 	int _evtfd;
 	int _shmid;
@@ -166,7 +167,9 @@ struct ublk_queue {
 
 /* borrow one bit of ublk uapi flags, which may never be used */
 #define UBLKS_Q_AUTO_BUF_REG_FALLBACK	(1ULL << 63)
+#define UBLKS_Q_NO_UBLK_FIXED_FD	(1ULL << 62)
 	__u64 flags;
+	int ublk_fd;	/* cached ublk char device fd */
 	struct ublk_io ios[UBLK_QUEUE_DEPTH];
 };
 
@@ -273,34 +276,48 @@ static inline int ublk_io_alloc_sqes(struct ublk_thread *t,
 	return nr_sqes;
 }
 
-static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe,
-		int dev_fd, int tag, int q_id, __u64 index)
+static inline int ublk_get_registered_fd(struct ublk_queue *q, int fd_index)
+{
+	if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
+		if (fd_index == 0)
+			/* Return the raw ublk FD for index 0 */
+			return q->ublk_fd;
+		/* Adjust index for backing files (index 1 becomes 0, etc.) */
+		return fd_index - 1;
+	}
+	return fd_index;
+}
+
+static inline void __io_uring_prep_buf_reg_unreg(struct io_uring_sqe *sqe,
+		struct ublk_queue *q, int tag, int q_id, __u64 index)
 {
 	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+	int dev_fd = ublk_get_registered_fd(q, 0);
 
 	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
 	sqe->opcode = IORING_OP_URING_CMD;
-	sqe->flags |= IOSQE_FIXED_FILE;
-	sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF;
+	if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
+		sqe->flags &= ~IOSQE_FIXED_FILE;
+	else
+		sqe->flags |= IOSQE_FIXED_FILE;
 
 	cmd->tag = tag;
 	cmd->addr = index;
 	cmd->q_id = q_id;
 }
 
-static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe,
-		int dev_fd, int tag, int q_id, __u64 index)
+static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe,
+		struct ublk_queue *q, int tag, int q_id, __u64 index)
 {
-	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+	__io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index);
+	sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF;
+}
 
-	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
-	sqe->opcode = IORING_OP_URING_CMD;
-	sqe->flags |= IOSQE_FIXED_FILE;
+static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe,
+		struct ublk_queue *q, int tag, int q_id, __u64 index)
+{
+	__io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index);
 	sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF;
-
-	cmd->tag = tag;
-	cmd->addr = index;
-	cmd->q_id = q_id;
 }
 
 static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe)
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index f0e0003a4860..280043f6b689 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -63,7 +63,7 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q,
 	ublk_io_alloc_sqes(t, sqe, 3);
 
-	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+	io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[0]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
@@ -71,7 +71,7 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q,
 	__setup_nop_io(tag, iod, sqe[1], q->q_id);
 	sqe[1]->flags |= IOSQE_IO_HARDLINK;
 
-	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+	io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 1fb9b7cc281b..791fa8dc1651 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -142,7 +142,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	ublk_io_alloc_sqes(t, sqe, s->nr + extra);
 
 	if (zc) {
-		io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index);
+		io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index);
 		sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 		sqe[0]->user_data = build_user_data(tag,
 				ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
@@ -168,7 +168,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	if (zc) {
 		struct io_uring_sqe *unreg = sqe[s->nr + 1];
 
-		io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, io->buf_index);
+		io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index);
 		unreg->user_data = build_user_data(
 			tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
 	}
diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh
index 40d1437ca298..3f901db4d09d 100755
--- a/tools/testing/selftests/ublk/test_stress_04.sh
+++ b/tools/testing/selftests/ublk/test_stress_04.sh
@@ -28,14 +28,14 @@ _create_backfile 0 256M
 _create_backfile 1 128M
 _create_backfile 2 128M
 
-ublk_io_and_kill_daemon 8G -t null -q 4 -z &
-ublk_io_and_kill_daemon 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
+ublk_io_and_kill_daemon 8G -t null -q 4 -z --no_ublk_fixed_fd &
+ublk_io_and_kill_daemon 256M -t loop -q 4 -z --no_ublk_fixed_fd "${UBLK_BACKFILES[0]}" &
 ublk_io_and_kill_daemon 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
 
 if _have_feature "AUTO_BUF_REG"; then
 	ublk_io_and_kill_daemon 8G -t null -q 4 --auto_zc &
 	ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" &
-	ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+	ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc --no_ublk_fixed_fd "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
 	ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
 fi
diff --git a/tools/testing/shared/linux/idr.h b/tools/testing/shared/linux/idr.h
index 4e342f2e37cf..676c5564e33f 100644
--- a/tools/testing/shared/linux/idr.h
+++ b/tools/testing/shared/linux/idr.h
@@ -1 +1,5 @@
+/* Avoid duplicate definitions due to system headers. */
+#ifdef __CONCAT
+#undef __CONCAT
+#endif
 #include "../../../../include/linux/idr.h"
diff --git a/tools/tracing/latency/Makefile.config b/tools/tracing/latency/Makefile.config
index 0fe6b50f029b..6efa13e3ca93 100644
--- a/tools/tracing/latency/Makefile.config
+++ b/tools/tracing/latency/Makefile.config
@@ -1,7 +1,15 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+include $(srctree)/tools/scripts/utilities.mak
+
 STOP_ERROR :=
 
+ifndef ($(NO_LIBTRACEEVENT),1)
+  ifeq ($(call get-executable,$(PKG_CONFIG)),)
+    $(error Error: $(PKG_CONFIG) needed by libtraceevent/libtracefs is missing on this system, please install it)
+  endif
+endif
+
 define lib_setup
   $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
   $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)"))
diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config
index 5f2231d8d626..07ff5e8f3006 100644
--- a/tools/tracing/rtla/Makefile.config
+++ b/tools/tracing/rtla/Makefile.config
@@ -1,10 +1,18 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+include $(srctree)/tools/scripts/utilities.mak
+
 STOP_ERROR :=
 
 LIBTRACEEVENT_MIN_VERSION = 1.5
 LIBTRACEFS_MIN_VERSION = 1.6
 
+ifndef ($(NO_LIBTRACEEVENT),1)
+  ifeq ($(call get-executable,$(PKG_CONFIG)),)
+    $(error Error: $(PKG_CONFIG) needed by libtraceevent/libtracefs is missing on this system, please install it)
+  endif
+endif
+
 define lib_setup
   $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
   $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)"))
diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index aaf0808125d7..13ff1934d47c 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -49,7 +49,7 @@ actions_destroy(struct actions *self)
 static struct action *
 actions_new(struct actions *self)
 {
-	if (self->size >= self->len) {
+	if (self->len >= self->size) {
 		self->size *= 2;
 		self->list = realloc(self->list, self->size * sizeof(struct action));
 	}
@@ -131,7 +131,7 @@ actions_parse(struct actions *self, const char *trigger)
 {
 	enum action_type type = ACTION_NONE;
 	char *token;
-	char trigger_c[strlen(trigger)];
+	char trigger_c[strlen(trigger) + 1];
 
 	/* For ACTION_SIGNAL */
 	int signal = 0, pid = 0;