From db751e309ff05461a0c8e114b1238d7a69cc1f18 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 16 Mar 2020 16:50:43 +0000 Subject: ELF: UAPI and Kconfig additions for ELF program properties Pull the basic ELF definitions relating to the NT_GNU_PROPERTY_TYPE_0 note from Yu-Cheng Yu's earlier x86 shstk series. Signed-off-by: Mark Brown Signed-off-by: Dave Martin Signed-off-by: Yu-cheng Yu Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas --- include/uapi/linux/elf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 34c02e4290fe..c37731407074 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -36,6 +36,7 @@ typedef __s64 Elf64_Sxword; #define PT_LOPROC 0x70000000 #define PT_HIPROC 0x7fffffff #define PT_GNU_EH_FRAME 0x6474e550 +#define PT_GNU_PROPERTY 0x6474e553 #define PT_GNU_STACK (PT_LOOS + 0x474e551) -- cgit v1.2.3 From 00e19ceec80b03a43f626f891fcc53e57919f1b3 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 16 Mar 2020 16:50:44 +0000 Subject: ELF: Add ELF program property parsing support ELF program properties will be needed for detecting whether to enable optional architecture or ABI features for a new ELF process. For now, there are no generic properties that we care about, so do nothing unless CONFIG_ARCH_USE_GNU_PROPERTY=y. Otherwise, the presence of properties using the PT_PROGRAM_PROPERTY phdrs entry (if any), and notify each property to the arch code. For now, the added code is not used. Signed-off-by: Mark Brown Signed-off-by: Dave Martin Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas --- fs/binfmt_elf.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++ fs/compat_binfmt_elf.c | 4 ++ include/linux/elf.h | 19 +++++++ include/uapi/linux/elf.h | 4 ++ 4 files changed, 154 insertions(+) (limited to 'include/uapi') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f4713ea76e82..1fb67e506b68 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -39,12 +39,18 @@ #include #include #include +#include +#include #include #include #include #include #include +#ifndef ELF_COMPAT +#define ELF_COMPAT 0 +#endif + #ifndef user_long_t #define user_long_t long #endif @@ -681,6 +687,111 @@ out: * libraries. There is no binary dependent code anywhere else. */ +static int parse_elf_property(const char *data, size_t *off, size_t datasz, + struct arch_elf_state *arch, + bool have_prev_type, u32 *prev_type) +{ + size_t o, step; + const struct gnu_property *pr; + int ret; + + if (*off == datasz) + return -ENOENT; + + if (WARN_ON_ONCE(*off > datasz || *off % ELF_GNU_PROPERTY_ALIGN)) + return -EIO; + o = *off; + datasz -= *off; + + if (datasz < sizeof(*pr)) + return -ENOEXEC; + pr = (const struct gnu_property *)(data + o); + o += sizeof(*pr); + datasz -= sizeof(*pr); + + if (pr->pr_datasz > datasz) + return -ENOEXEC; + + WARN_ON_ONCE(o % ELF_GNU_PROPERTY_ALIGN); + step = round_up(pr->pr_datasz, ELF_GNU_PROPERTY_ALIGN); + if (step > datasz) + return -ENOEXEC; + + /* Properties are supposed to be unique and sorted on pr_type: */ + if (have_prev_type && pr->pr_type <= *prev_type) + return -ENOEXEC; + *prev_type = pr->pr_type; + + ret = arch_parse_elf_property(pr->pr_type, data + o, + pr->pr_datasz, ELF_COMPAT, arch); + if (ret) + return ret; + + *off = o + step; + return 0; +} + +#define NOTE_DATA_SZ SZ_1K +#define GNU_PROPERTY_TYPE_0_NAME "GNU" +#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) + +static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, + struct arch_elf_state *arch) +{ + union { + struct elf_note nhdr; + char data[NOTE_DATA_SZ]; + } note; + loff_t pos; + ssize_t n; + size_t off, datasz; + int ret; + bool have_prev_type; + u32 prev_type; + + if (!IS_ENABLED(CONFIG_ARCH_USE_GNU_PROPERTY) || !phdr) + return 0; + + /* load_elf_binary() shouldn't call us unless this is true... */ + if (WARN_ON_ONCE(phdr->p_type != PT_GNU_PROPERTY)) + return -ENOEXEC; + + /* If the properties are crazy large, that's too bad (for now): */ + if (phdr->p_filesz > sizeof(note)) + return -ENOEXEC; + + pos = phdr->p_offset; + n = kernel_read(f, ¬e, phdr->p_filesz, &pos); + + BUILD_BUG_ON(sizeof(note) < sizeof(note.nhdr) + NOTE_NAME_SZ); + if (n < 0 || n < sizeof(note.nhdr) + NOTE_NAME_SZ) + return -EIO; + + if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || + note.nhdr.n_namesz != NOTE_NAME_SZ || + strncmp(note.data + sizeof(note.nhdr), + GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + return -ENOEXEC; + + off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, + ELF_GNU_PROPERTY_ALIGN); + if (off > n) + return -ENOEXEC; + + if (note.nhdr.n_descsz > n - off) + return -ENOEXEC; + datasz = off + note.nhdr.n_descsz; + + have_prev_type = false; + do { + ret = parse_elf_property(note.data, &off, datasz, arch, + have_prev_type, &prev_type); + have_prev_type = true; + } while (!ret); + + return ret == -ENOENT ? 0 : ret; +} + static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ @@ -688,6 +799,7 @@ static int load_elf_binary(struct linux_binprm *bprm) int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; + struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_bss, elf_brk; int bss_prot = 0; int retval, i; @@ -733,6 +845,11 @@ static int load_elf_binary(struct linux_binprm *bprm) for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; + if (elf_ppnt->p_type == PT_GNU_PROPERTY) { + elf_property_phdata = elf_ppnt; + continue; + } + if (elf_ppnt->p_type != PT_INTERP) continue; @@ -820,9 +937,14 @@ out_free_interp: goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ + elf_property_phdata = NULL; elf_ppnt = interp_elf_phdata; for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { + case PT_GNU_PROPERTY: + elf_property_phdata = elf_ppnt; + break; + case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(&loc->interp_elf_ex, elf_ppnt, interpreter, @@ -833,6 +955,11 @@ out_free_interp: } } + retval = parse_elf_properties(interpreter ?: bprm->file, + elf_property_phdata, &arch_state); + if (retval) + goto out_free_dentry; + /* * Allow arch code to reject the ELF at this point, whilst it's * still possible to return an error to the code that invoked diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index aaad4ca1217e..13a087bc816b 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -17,6 +17,8 @@ #include #include +#define ELF_COMPAT 1 + /* * Rename the basic ELF layout types to refer to the 32-bit class of files. */ @@ -28,11 +30,13 @@ #undef elf_shdr #undef elf_note #undef elf_addr_t +#undef ELF_GNU_PROPERTY_ALIGN #define elfhdr elf32_hdr #define elf_phdr elf32_phdr #define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Addr +#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN /* * Some data types as stored in coredump. diff --git a/include/linux/elf.h b/include/linux/elf.h index f7b24c5fcfb6..db5113479f5e 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -22,6 +22,9 @@ SET_PERSONALITY(ex) #endif +#define ELF32_GNU_PROPERTY_ALIGN 4 +#define ELF64_GNU_PROPERTY_ALIGN 8 + #if ELF_CLASS == ELFCLASS32 extern Elf32_Dyn _DYNAMIC []; @@ -32,6 +35,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half #define Elf_Word Elf32_Word +#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN #else @@ -43,6 +47,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half #define Elf_Word Elf64_Word +#define ELF_GNU_PROPERTY_ALIGN ELF64_GNU_PROPERTY_ALIGN #endif @@ -68,4 +73,18 @@ struct gnu_property { u32 pr_datasz; }; +struct arch_elf_state; + +#ifndef CONFIG_ARCH_USE_GNU_PROPERTY +static inline int arch_parse_elf_property(u32 type, const void *data, + size_t datasz, bool compat, + struct arch_elf_state *arch) +{ + return 0; +} +#else +extern int arch_parse_elf_property(u32 type, const void *data, size_t datasz, + bool compat, struct arch_elf_state *arch); +#endif + #endif /* _LINUX_ELF_H */ diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index c37731407074..20900f4496b7 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -368,6 +368,7 @@ typedef struct elf64_shdr { * Notes used in ET_CORE. Architectures export some of the arch register sets * using the corresponding note types via the PTRACE_GETREGSET and * PTRACE_SETREGSET requests. + * The note name for all these is "LINUX". */ #define NT_PRSTATUS 1 #define NT_PRFPREG 2 @@ -430,6 +431,9 @@ typedef struct elf64_shdr { #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +/* Note types with note name "GNU" */ +#define NT_GNU_PROPERTY_TYPE_0 5 + /* Note header in a PT_NOTE section */ typedef struct elf32_note { Elf32_Word n_namesz; /* Name size */ -- cgit v1.2.3 From ab7876a98a2160092133de4c648e94b18bc3f139 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 16 Mar 2020 16:50:47 +0000 Subject: arm64: elf: Enable BTI at exec based on ELF program properties For BTI protection to be as comprehensive as possible, it is desirable to have BTI enabled from process startup. If this is not done, the process must use mprotect() to enable BTI for each of its executable mappings, but this is painful to do in the libc startup code. It's simpler and more sound to have the kernel do it instead. To this end, detect BTI support in the executable (or ELF interpreter, as appropriate), via the NT_GNU_PROGRAM_PROPERTY_TYPE_0 note, and tweak the initial prot flags for the process' executable pages to include PROT_BTI as appropriate. Signed-off-by: Mark Brown Signed-off-by: Dave Martin Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 +++ arch/arm64/include/asm/elf.h | 50 ++++++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/process.c | 19 +++++++++++++++++ include/uapi/linux/elf.h | 6 ++++++ 4 files changed, 78 insertions(+) (limited to 'include/uapi') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0b30e884e088..8a15bc68dadd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -9,6 +9,7 @@ config ARM64 select ACPI_MCFG if (ACPI && PCI) select ACPI_SPCR_TABLE if ACPI select ACPI_PPTT if ACPI + select ARCH_BINFMT_ELF_STATE select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -33,6 +34,7 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION @@ -62,6 +64,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_GNU_PROPERTY if BINFMT_ELF select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_SUPPORTS_MEMORY_FAILURE diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index b618017205a3..4f00d50585a4 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -114,7 +114,11 @@ #ifndef __ASSEMBLY__ +#include #include +#include +#include +#include #include /* for signal_minsigstksz, used by ARCH_DLINFO */ typedef unsigned long elf_greg_t; @@ -224,6 +228,52 @@ extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, #endif /* CONFIG_COMPAT */ +struct arch_elf_state { + int flags; +}; + +#define ARM64_ELF_BTI (1 << 0) + +#define INIT_ARCH_ELF_STATE { \ + .flags = 0, \ +} + +static inline int arch_parse_elf_property(u32 type, const void *data, + size_t datasz, bool compat, + struct arch_elf_state *arch) +{ + /* No known properties for AArch32 yet */ + if (IS_ENABLED(CONFIG_COMPAT) && compat) + return 0; + + if (type == GNU_PROPERTY_AARCH64_FEATURE_1_AND) { + const u32 *p = data; + + if (datasz != sizeof(*p)) + return -ENOEXEC; + + if (system_supports_bti() && + (*p & GNU_PROPERTY_AARCH64_FEATURE_1_BTI)) + arch->flags |= ARM64_ELF_BTI; + } + + return 0; +} + +static inline int arch_elf_pt_proc(void *ehdr, void *phdr, + struct file *f, bool is_interp, + struct arch_elf_state *state) +{ + return 0; +} + +static inline int arch_check_elf(void *ehdr, bool has_interp, + void *interp_ehdr, + struct arch_elf_state *state) +{ + return 0; +} + #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 00626057a384..b8e3faa8d406 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -654,3 +656,20 @@ asmlinkage void __sched arm64_preempt_schedule_irq(void) if (system_capabilities_finalized()) preempt_schedule_irq(); } + +#ifdef CONFIG_BINFMT_ELF +int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, + bool has_interp, bool is_interp) +{ + if (is_interp != has_interp) + return prot; + + if (!(state->flags & ARM64_ELF_BTI)) + return prot; + + if (prot & PROT_EXEC) + prot |= PROT_BTI; + + return prot; +} +#endif diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 20900f4496b7..c6dd0215482e 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -448,4 +448,10 @@ typedef struct elf64_note { Elf64_Word n_type; /* Content type */ } Elf64_Nhdr; +/* .note.gnu.property types for EM_AARCH64: */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +/* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) + #endif /* _UAPI_LINUX_ELF_H */ -- cgit v1.2.3 From 11ecbdddf2f8b6cc2480aff6d877b7a4076e3b7f Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Tue, 17 Mar 2020 15:22:22 +0200 Subject: drm/i915/perf: introduce global sseu pinning On Gen11 powergating half the execution units is a functional requirement when using the VME samplers. Not fullfilling this requirement can lead to hangs. This unfortunately plays fairly poorly with the NOA requirements. NOA requires a stable power configuration to maintain its configuration. As a result using OA (and NOA feeding into it) so far has required us to use a power configuration that can work for all contexts. The only power configuration fullfilling this is powergating half the execution units. This makes performance analysis for 3D workloads somewhat pointless. Failing to find a solution that would work for everybody, this change introduces a new i915-perf stream open parameter that punts the decision off to userspace. If this parameter is omitted, the existing Gen11 behavior remains (half EU array powergating). This change takes the initiative to move all perf related sseu configuration into i915_perf.c v2: Make parameter priviliged if different from default v3: Fix context modifying its sseu config while i915-perf is enabled v4: Always consider global sseu a privileged operation (Tvrtko) Override req_sseu point in intel_sseu_make_rpcs() (Tvrtko) Remove unrelated changes (Tvrtko) v5: Some typos (Tvrtko) Process sseu param in read_properties_unlocked() (Tvrtko) v6: Actually commit the bits from v5... Fixup some checkpath warnings v7: Only compare engine uabi field (Chris) Signed-off-by: Lionel Landwerlin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200317132222.2638719-3-lionel.g.landwerlin@intel.com --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 10 ++-- drivers/gpu/drm/i915/gem/i915_gem_context.h | 4 ++ drivers/gpu/drm/i915/gt/intel_sseu.c | 33 +++---------- drivers/gpu/drm/i915/i915_perf.c | 76 ++++++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_perf_types.h | 7 +++ include/uapi/drm/i915_drm.h | 11 +++++ 6 files changed, 108 insertions(+), 33 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 026999b34abd..c0e476fcd1fa 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -1401,10 +1401,10 @@ static int get_ringsize(struct i915_gem_context *ctx, return 0; } -static int -user_to_context_sseu(struct drm_i915_private *i915, - const struct drm_i915_gem_context_param_sseu *user, - struct intel_sseu *context) +int +i915_gem_user_to_context_sseu(struct drm_i915_private *i915, + const struct drm_i915_gem_context_param_sseu *user, + struct intel_sseu *context) { const struct sseu_dev_info *device = &RUNTIME_INFO(i915)->sseu; @@ -1539,7 +1539,7 @@ static int set_sseu(struct i915_gem_context *ctx, goto out_ce; } - ret = user_to_context_sseu(i915, &user_sseu, &sseu); + ret = i915_gem_user_to_context_sseu(i915, &user_sseu, &sseu); if (ret) goto out_ce; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h index f1d884d304bd..3702b2fb27ab 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h @@ -225,4 +225,8 @@ i915_gem_engines_iter_next(struct i915_gem_engines_iter *it); struct i915_lut_handle *i915_lut_handle_alloc(void); void i915_lut_handle_free(struct i915_lut_handle *lut); +int i915_gem_user_to_context_sseu(struct drm_i915_private *i915, + const struct drm_i915_gem_context_param_sseu *user, + struct intel_sseu *context); + #endif /* !__I915_GEM_CONTEXT_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.c b/drivers/gpu/drm/i915/gt/intel_sseu.c index 74f793423231..d173271c7397 100644 --- a/drivers/gpu/drm/i915/gt/intel_sseu.c +++ b/drivers/gpu/drm/i915/gt/intel_sseu.c @@ -65,7 +65,6 @@ u32 intel_sseu_make_rpcs(struct drm_i915_private *i915, { const struct sseu_dev_info *sseu = &RUNTIME_INFO(i915)->sseu; bool subslice_pg = sseu->has_subslice_pg; - struct intel_sseu ctx_sseu; u8 slices, subslices; u32 rpcs = 0; @@ -78,31 +77,13 @@ u32 intel_sseu_make_rpcs(struct drm_i915_private *i915, /* * If i915/perf is active, we want a stable powergating configuration - * on the system. - * - * We could choose full enablement, but on ICL we know there are use - * cases which disable slices for functional, apart for performance - * reasons. So in this case we select a known stable subset. + * on the system. Use the configuration pinned by i915/perf. */ - if (!i915->perf.exclusive_stream) { - ctx_sseu = *req_sseu; - } else { - ctx_sseu = intel_sseu_from_device_info(sseu); - - if (IS_GEN(i915, 11)) { - /* - * We only need subslice count so it doesn't matter - * which ones we select - just turn off low bits in the - * amount of half of all available subslices per slice. - */ - ctx_sseu.subslice_mask = - ~(~0 << (hweight8(ctx_sseu.subslice_mask) / 2)); - ctx_sseu.slice_mask = 0x1; - } - } + if (i915->perf.exclusive_stream) + req_sseu = &i915->perf.sseu; - slices = hweight8(ctx_sseu.slice_mask); - subslices = hweight8(ctx_sseu.subslice_mask); + slices = hweight8(req_sseu->slice_mask); + subslices = hweight8(req_sseu->subslice_mask); /* * Since the SScount bitfield in GEN8_R_PWR_CLK_STATE is only three bits @@ -175,13 +156,13 @@ u32 intel_sseu_make_rpcs(struct drm_i915_private *i915, if (sseu->has_eu_pg) { u32 val; - val = ctx_sseu.min_eus_per_subslice << GEN8_RPCS_EU_MIN_SHIFT; + val = req_sseu->min_eus_per_subslice << GEN8_RPCS_EU_MIN_SHIFT; GEM_BUG_ON(val & ~GEN8_RPCS_EU_MIN_MASK); val &= GEN8_RPCS_EU_MIN_MASK; rpcs |= val; - val = ctx_sseu.max_eus_per_subslice << GEN8_RPCS_EU_MAX_SHIFT; + val = req_sseu->max_eus_per_subslice << GEN8_RPCS_EU_MAX_SHIFT; GEM_BUG_ON(val & ~GEN8_RPCS_EU_MAX_MASK); val &= GEN8_RPCS_EU_MAX_MASK; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 86c6abaa3e0e..4611f72cd036 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -344,6 +344,10 @@ static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = { * @oa_periodic: Whether to enable periodic OA unit sampling * @oa_period_exponent: The OA unit sampling period is derived from this * @engine: The engine (typically rcs0) being monitored by the OA unit + * @has_sseu: Whether @sseu was specified by userspace + * @sseu: internal SSEU configuration computed either from the userspace + * specified configuration in the opening parameters or a default value + * (see get_default_sseu_config()) * * As read_properties_unlocked() enumerates and validates the properties given * to open a stream of metrics the configuration is built up in the structure @@ -363,6 +367,9 @@ struct perf_open_properties { int oa_period_exponent; struct intel_engine_cs *engine; + + bool has_sseu; + struct intel_sseu sseu; }; struct i915_oa_config_bo { @@ -2720,6 +2727,39 @@ static int i915_perf_stream_enable_sync(struct i915_perf_stream *stream) return 0; } +static void +get_default_sseu_config(struct intel_sseu *out_sseu, + struct intel_engine_cs *engine) +{ + const struct sseu_dev_info *devinfo_sseu = + &RUNTIME_INFO(engine->i915)->sseu; + + *out_sseu = intel_sseu_from_device_info(devinfo_sseu); + + if (IS_GEN(engine->i915, 11)) { + /* + * We only need subslice count so it doesn't matter which ones + * we select - just turn off low bits in the amount of half of + * all available subslices per slice. + */ + out_sseu->subslice_mask = + ~(~0 << (hweight8(out_sseu->subslice_mask) / 2)); + out_sseu->slice_mask = 0x1; + } +} + +static int +get_sseu_config(struct intel_sseu *out_sseu, + struct intel_engine_cs *engine, + const struct drm_i915_gem_context_param_sseu *drm_sseu) +{ + if (drm_sseu->engine.engine_class != engine->uabi_class || + drm_sseu->engine.engine_instance != engine->uabi_instance) + return -EINVAL; + + return i915_gem_user_to_context_sseu(engine->i915, drm_sseu, out_sseu); +} + /** * i915_oa_stream_init - validate combined props for OA stream and init * @stream: An i915 perf stream @@ -2852,6 +2892,8 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, goto err_oa_buf_alloc; stream->ops = &i915_oa_stream_ops; + + perf->sseu = props->sseu; WRITE_ONCE(perf->exclusive_stream, stream); ret = i915_perf_stream_enable_sync(stream); @@ -3397,6 +3439,14 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf, privileged_op = true; } + /* + * Asking for SSEU configuration is a priviliged operation. + */ + if (props->has_sseu) + privileged_op = true; + else + get_default_sseu_config(&props->sseu, props->engine); + /* Similar to perf's kernel.perf_paranoid_cpu sysctl option * we check a dev.i915.perf_stream_paranoid sysctl option * to determine if it's ok to access system wide OA counters @@ -3492,6 +3542,7 @@ static int read_properties_unlocked(struct i915_perf *perf, { u64 __user *uprop = uprops; u32 i; + int ret; memset(props, 0, sizeof(struct perf_open_properties)); @@ -3523,7 +3574,6 @@ static int read_properties_unlocked(struct i915_perf *perf, for (i = 0; i < n_props; i++) { u64 oa_period, oa_freq_hz; u64 id, value; - int ret; ret = get_user(id, uprop); if (ret) @@ -3609,6 +3659,24 @@ static int read_properties_unlocked(struct i915_perf *perf, case DRM_I915_PERF_PROP_HOLD_PREEMPTION: props->hold_preemption = !!value; break; + case DRM_I915_PERF_PROP_GLOBAL_SSEU: { + struct drm_i915_gem_context_param_sseu user_sseu; + + if (copy_from_user(&user_sseu, + u64_to_user_ptr(value), + sizeof(user_sseu))) { + DRM_DEBUG("Unable to copy global sseu parameter\n"); + return -EFAULT; + } + + ret = get_sseu_config(&props->sseu, props->engine, &user_sseu); + if (ret) { + DRM_DEBUG("Invalid SSEU configuration\n"); + return ret; + } + props->has_sseu = true; + break; + } case DRM_I915_PERF_PROP_MAX: MISSING_CASE(id); return -EINVAL; @@ -4382,8 +4450,12 @@ int i915_perf_ioctl_version(void) * preemption on a particular context so that performance data is * accessible from a delta of MI_RPC reports without looking at the * OA buffer. + * + * 4: Add DRM_I915_PERF_PROP_ALLOWED_SSEU to limit what contexts can + * be run for the duration of the performance recording based on + * their SSEU configuration. */ - return 3; + return 4; } #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h index f4ccd2adfee6..32289cbda648 100644 --- a/drivers/gpu/drm/i915/i915_perf_types.h +++ b/drivers/gpu/drm/i915/i915_perf_types.h @@ -16,6 +16,7 @@ #include #include +#include "gt/intel_sseu.h" #include "i915_reg.h" #include "intel_wakeref.h" @@ -407,6 +408,12 @@ struct i915_perf { */ struct i915_perf_stream *exclusive_stream; + /** + * @sseu: sseu configuration selected to run while perf is active, + * applies to all contexts. + */ + struct intel_sseu sseu; + /** * For rate limiting any notifications of spurious * invalid OA reports diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 2813e579b480..db649d03ab52 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1969,6 +1969,17 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_HOLD_PREEMPTION, + /** + * Specifying this pins all contexts to the specified SSEU power + * configuration for the duration of the recording. + * + * This parameter's value is a pointer to a struct + * drm_i915_gem_context_param_sseu. + * + * This property is available in perf revision 4. + */ + DRM_I915_PERF_PROP_GLOBAL_SSEU, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; -- cgit v1.2.3 From 4ef10fe05ba0b08ce7029c07878afe3c8d5754d8 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Tue, 24 Mar 2020 11:54:57 -0700 Subject: drm/i915/perf: add new open param to configure polling of OA buffer This new parameter let's the application choose how often the OA buffer should be checked on the CPU side for data availability. Longer polling period tend to reduce CPU overhead if the application does not care about somewhat real time data collection. v2: Allow disabling polling completely with 0 value (Lionel) v3: Version the new parameter (Joonas) v4: Rebase (Umesh) v5: Make poll delay value of 0 invalid (Umesh) v6: - Describe poll_oa_period (Ashutosh) - Fix comment for new poll parameter (Lionel) - Drop open_flags in read_properties_unlocked (Lionel) - Rename uapi parameter (Ashutosh) v7: Reword the comment in uapi (Ashutosh) Signed-off-by: Lionel Landwerlin Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Ashutosh Dixit Signed-off-by: Lionel Landwerlin Link: https://patchwork.freedesktop.org/patch/msgid/20200324185457.14635-4-umesh.nerlige.ramappa@intel.com --- drivers/gpu/drm/i915/i915_perf.c | 32 +++++++++++++++++++++++++------- drivers/gpu/drm/i915/i915_perf_types.h | 6 ++++++ include/uapi/drm/i915_drm.h | 13 +++++++++++++ 3 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 4cadf97f94bc..c74ebac50015 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -248,11 +248,11 @@ #define OA_TAIL_MARGIN_NSEC 100000ULL #define INVALID_TAIL_PTR 0xffffffff -/* frequency for checking whether the OA unit has written new reports to the - * circular OA buffer... +/* The default frequency for checking whether the OA unit has written new + * reports to the circular OA buffer... */ -#define POLL_FREQUENCY 200 -#define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) +#define DEFAULT_POLL_FREQUENCY_HZ 200 +#define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ) /* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */ static u32 i915_perf_stream_paranoid = true; @@ -339,6 +339,8 @@ static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = { * @sseu: internal SSEU configuration computed either from the userspace * specified configuration in the opening parameters or a default value * (see get_default_sseu_config()) + * @poll_oa_period: The period in nanoseconds at which the CPU will check for OA + * data availability * * As read_properties_unlocked() enumerates and validates the properties given * to open a stream of metrics the configuration is built up in the structure @@ -361,6 +363,8 @@ struct perf_open_properties { bool has_sseu; struct intel_sseu sseu; + + u64 poll_oa_period; }; struct i915_oa_config_bo { @@ -2600,7 +2604,7 @@ static void i915_oa_stream_enable(struct i915_perf_stream *stream) if (stream->periodic) hrtimer_start(&stream->poll_check_timer, - ns_to_ktime(POLL_PERIOD), + ns_to_ktime(stream->poll_oa_period), HRTIMER_MODE_REL_PINNED); } @@ -3035,7 +3039,8 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer) wake_up(&stream->poll_wq); } - hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD)); + hrtimer_forward_now(hrtimer, + ns_to_ktime(stream->poll_oa_period)); return HRTIMER_RESTART; } @@ -3424,6 +3429,7 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf, stream->perf = perf; stream->ctx = specific_ctx; + stream->poll_oa_period = props->poll_oa_period; ret = i915_oa_stream_init(stream, param, props); if (ret) @@ -3502,6 +3508,7 @@ static int read_properties_unlocked(struct i915_perf *perf, int ret; memset(props, 0, sizeof(struct perf_open_properties)); + props->poll_oa_period = DEFAULT_POLL_PERIOD_NS; if (!n_props) { DRM_DEBUG("No i915 perf properties given\n"); @@ -3634,6 +3641,14 @@ static int read_properties_unlocked(struct i915_perf *perf, props->has_sseu = true; break; } + case DRM_I915_PERF_PROP_POLL_OA_PERIOD: + if (value < 100000 /* 100us */) { + DRM_DEBUG("OA availability timer too small (%lluns < 100us)\n", + value); + return -EINVAL; + } + props->poll_oa_period = value; + break; case DRM_I915_PERF_PROP_MAX: MISSING_CASE(id); return -EINVAL; @@ -4416,8 +4431,11 @@ int i915_perf_ioctl_version(void) * 4: Add DRM_I915_PERF_PROP_ALLOWED_SSEU to limit what contexts can * be run for the duration of the performance recording based on * their SSEU configuration. + * + * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the + * interval for the hrtimer used to check for OA data. */ - return 4; + return 5; } #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h index c3ab184c604a..371dcf243622 100644 --- a/drivers/gpu/drm/i915/i915_perf_types.h +++ b/drivers/gpu/drm/i915/i915_perf_types.h @@ -304,6 +304,12 @@ struct i915_perf_stream { * reprogrammed. */ struct i915_vma *noa_wait; + + /** + * @poll_oa_period: The period in nanoseconds at which the OA + * buffer should be checked for available data. + */ + u64 poll_oa_period; }; /** diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index db649d03ab52..14b67cd6b54b 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1980,6 +1980,19 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_GLOBAL_SSEU, + /** + * This optional parameter specifies the timer interval in nanoseconds + * at which the i915 driver will check the OA buffer for available data. + * Minimum allowed value is 100 microseconds. A default value is used by + * the driver if this parameter is not specified. Note that larger timer + * values will reduce cpu consumption during OA perf captures. However, + * excessively large values would potentially result in OA buffer + * overwrites as captures reach end of the OA buffer. + * + * This property is available in perf revision 5. + */ + DRM_I915_PERF_PROP_POLL_OA_PERIOD, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; -- cgit v1.2.3 From e150ef4169a766aa20003506c0f25b5519981e80 Mon Sep 17 00:00:00 2001 From: Karol Trzcinski Date: Wed, 15 Apr 2020 15:27:56 -0500 Subject: ASoC: SOF: Introduce extended manifest Extended manifest is a place to store build time known firmware metadata, for example firmware version or used compiler description. Given information is read on host side before firmware startup. This part of output binary is located as a first structure in binary file. Extended manifest should be skipped in firmware loading routine. Signed-off-by: Karol Trzcinski Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Kai Vehmanen Reviewed-by: Guennadi Liakhovetski Link: https://lore.kernel.org/r/20200415202816.934-5-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/ext_manifest.h | 61 +++++++++++++++++ sound/soc/sof/loader.c | 122 ++++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 include/uapi/sound/sof/ext_manifest.h (limited to 'include/uapi') diff --git a/include/uapi/sound/sof/ext_manifest.h b/include/uapi/sound/sof/ext_manifest.h new file mode 100644 index 000000000000..fc4d1bec0cb3 --- /dev/null +++ b/include/uapi/sound/sof/ext_manifest.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * Copyright(c) 2020 Intel Corporation. All rights reserved. + */ + +/* + * Extended manifest is a place to store metadata about firmware, known during + * compilation time - for example firmware version or used compiler. + * Given information are read on host side before firmware startup. + * This part of output binary is not signed. + */ + +#ifndef __SOF_FIRMWARE_EXT_MANIFEST_H__ +#define __SOF_FIRMWARE_EXT_MANIFEST_H__ + +#include + +/* In ASCII `XMan` */ +#define SOF_EXT_MAN_MAGIC_NUMBER 0x6e614d58 + +/* Build u32 number in format MMmmmppp */ +#define SOF_EXT_MAN_BUILD_VERSION(MAJOR, MINOR, PATH) ((uint32_t)( \ + ((MAJOR) << 24) | \ + ((MINOR) << 12) | \ + (PATH))) + +/* check extended manifest version consistency */ +#define SOF_EXT_MAN_VERSION_INCOMPATIBLE(host_ver, cli_ver) ( \ + ((host_ver) & GENMASK(31, 24)) != \ + ((cli_ver) & GENMASK(31, 24))) + +/* used extended manifest header version */ +#define SOF_EXT_MAN_VERSION SOF_EXT_MAN_BUILD_VERSION(1, 0, 0) + +/* extended manifest header, deleting any field breaks backward compatibility */ +struct sof_ext_man_header { + uint32_t magic; /*< identification number, */ + /*< EXT_MAN_MAGIC_NUMBER */ + uint32_t full_size; /*< [bytes] full size of ext_man, */ + /*< (header + content + padding) */ + uint32_t header_size; /*< [bytes] makes header extensionable, */ + /*< after append new field to ext_man header */ + /*< then backward compatible won't be lost */ + uint32_t header_version; /*< value of EXT_MAN_VERSION */ + /*< not related with following content */ + uint8_t elements[]; /*< list of ext_man_elem_* elements */ +} __packed; + +/* Now define extended manifest elements */ + +/* extended manifest element header */ +struct sof_ext_man_elem_header { + uint32_t type; /*< SOF_EXT_MAN_ELEM_ */ + uint32_t size; /*< in bytes, including header size */ + uint8_t blob[]; /*< type dependent content */ +} __packed; + +#endif /* __SOF_FIRMWARE_EXT_MANIFEST_H__ */ diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c index 89f35db2577d..6b354b6fb83a 100644 --- a/sound/soc/sof/loader.c +++ b/sound/soc/sof/loader.c @@ -12,6 +12,7 @@ #include #include +#include #include "ops.h" static int get_ext_windows(struct snd_sof_dev *sdev, @@ -126,6 +127,104 @@ int snd_sof_fw_parse_ext_data(struct snd_sof_dev *sdev, u32 bar, u32 offset) } EXPORT_SYMBOL(snd_sof_fw_parse_ext_data); +static ssize_t snd_sof_ext_man_size(const struct firmware *fw) +{ + const struct sof_ext_man_header *head = (void *)fw->data; + + /* + * assert fw size is big enough to contain extended manifest header, + * it prevents from reading unallocated memory from `head` in following + * step. + */ + if (fw->size < sizeof(*head)) + return -EINVAL; + + /* + * When fw points to extended manifest, + * then first u32 must be equal SOF_EXT_MAN_MAGIC_NUMBER. + */ + if (head->magic == SOF_EXT_MAN_MAGIC_NUMBER) + return head->full_size; + + /* otherwise given fw don't have an extended manifest */ + return 0; +} + +/* parse extended FW manifest data structures */ +static int snd_sof_fw_ext_man_parse(struct snd_sof_dev *sdev, + const struct firmware *fw) +{ + const struct sof_ext_man_elem_header *elem_hdr; + const struct sof_ext_man_header *head; + ssize_t ext_man_size; + ssize_t remaining; + uintptr_t iptr; + int ret = 0; + + head = (struct sof_ext_man_header *)fw->data; + remaining = head->full_size - head->header_size; + ext_man_size = snd_sof_ext_man_size(fw); + + /* Assert firmware starts with extended manifest */ + if (ext_man_size < 0) { + dev_err(sdev->dev, "error: exception while reading firmware extended manifest, code %d\n", + (int)ext_man_size); + return ext_man_size; + } else if (!ext_man_size) { + dev_err(sdev->dev, "error: can't parse extended manifest when it's not present\n"); + return -EINVAL; + } + + /* incompatible version */ + if (SOF_EXT_MAN_VERSION_INCOMPATIBLE(SOF_EXT_MAN_VERSION, + head->header_version)) { + dev_err(sdev->dev, "error: extended manifest version 0x%X differ from used 0x%X\n", + head->header_version, SOF_EXT_MAN_VERSION); + return -EINVAL; + } + + /* get first extended manifest element header */ + iptr = (uintptr_t)fw->data + head->header_size; + + while (remaining > sizeof(*elem_hdr)) { + elem_hdr = (struct sof_ext_man_elem_header *)iptr; + + dev_dbg(sdev->dev, "found sof_ext_man header type %d size 0x%X\n", + elem_hdr->type, elem_hdr->size); + + if (elem_hdr->size < sizeof(*elem_hdr) || + elem_hdr->size > remaining) { + dev_err(sdev->dev, "error: invalid sof_ext_man header size, type %d size 0x%X\n", + elem_hdr->type, elem_hdr->size); + break; + } + + /* process structure data */ + switch (elem_hdr->type) { + default: + dev_warn(sdev->dev, "warning: unknown sof_ext_man header type %d size 0x%X\n", + elem_hdr->type, elem_hdr->size); + break; + } + + if (ret < 0) { + dev_err(sdev->dev, "error: failed to parse sof_ext_man header type %d size 0x%X\n", + elem_hdr->type, elem_hdr->size); + break; + } + + remaining -= elem_hdr->size; + iptr += elem_hdr->size; + } + + if (remaining) { + dev_err(sdev->dev, "error: sof_ext_man header is inconsistent\n"); + ret = -EINVAL; + } + + return ret; +} + /* * IPC Firmware ready. */ @@ -473,6 +572,7 @@ int snd_sof_load_firmware_raw(struct snd_sof_dev *sdev) { struct snd_sof_pdata *plat_data = sdev->pdata; const char *fw_filename; + ssize_t ext_man_size; int ret; /* Don't request firmware again if firmware is already requested */ @@ -490,11 +590,33 @@ int snd_sof_load_firmware_raw(struct snd_sof_dev *sdev) if (ret < 0) { dev_err(sdev->dev, "error: request firmware %s failed err: %d\n", fw_filename, ret); + goto err; } else { dev_dbg(sdev->dev, "request_firmware %s successful\n", fw_filename); } + /* check for extended manifest */ + ext_man_size = snd_sof_ext_man_size(plat_data->fw); + if (ext_man_size > 0) { + ret = snd_sof_fw_ext_man_parse(sdev, plat_data->fw); + + /* when no error occurred, drop extended manifest */ + if (!ret) + plat_data->fw_offset = ext_man_size; + else + dev_err(sdev->dev, "error: firmware %s contains unsupported or invalid extended manifest: %d\n", + fw_filename, ret); + } else if (!ext_man_size) { + /* No extended manifest, so nothing to skip during FW load */ + dev_dbg(sdev->dev, "firmware doesn't contain extended manifest\n"); + } else { + ret = ext_man_size; + dev_err(sdev->dev, "error: firmware %s contains unsupported or invalid extended manifest: %d\n", + fw_filename, ret); + } + +err: kfree(fw_filename); return ret; -- cgit v1.2.3 From 371091417864b7be870eaad45e043059e6b6828a Mon Sep 17 00:00:00 2001 From: Karol Trzcinski Date: Wed, 15 Apr 2020 15:27:57 -0500 Subject: ASoC: SOF: ext_manifest: parse firmware version The firmware version can be extracted from the extended manifest content. This information known at build time does not need to be provided in a mailbox. Signed-off-by: Karol Trzcinski Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Kai Vehmanen Link: https://lore.kernel.org/r/20200415202816.934-6-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/ext_manifest.h | 13 +++++++++++++ sound/soc/sof/loader.c | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/sof/ext_manifest.h b/include/uapi/sound/sof/ext_manifest.h index fc4d1bec0cb3..163d32210ad8 100644 --- a/include/uapi/sound/sof/ext_manifest.h +++ b/include/uapi/sound/sof/ext_manifest.h @@ -51,6 +51,11 @@ struct sof_ext_man_header { /* Now define extended manifest elements */ +/* Extended manifest elements types */ +enum sof_ext_man_elem_type { + SOF_EXT_MAN_ELEM_FW_VERSION = 0, +}; + /* extended manifest element header */ struct sof_ext_man_elem_header { uint32_t type; /*< SOF_EXT_MAN_ELEM_ */ @@ -58,4 +63,12 @@ struct sof_ext_man_elem_header { uint8_t blob[]; /*< type dependent content */ } __packed; +/* FW version */ +struct sof_ext_man_fw_version { + struct sof_ext_man_elem_header hdr; + /* use sof_ipc struct because of code re-use */ + struct sof_ipc_fw_version version; + uint32_t flags; +} __packed; + #endif /* __SOF_FIRMWARE_EXT_MANIFEST_H__ */ diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c index 6b354b6fb83a..01077c1ce235 100644 --- a/sound/soc/sof/loader.c +++ b/sound/soc/sof/loader.c @@ -127,6 +127,20 @@ int snd_sof_fw_parse_ext_data(struct snd_sof_dev *sdev, u32 bar, u32 offset) } EXPORT_SYMBOL(snd_sof_fw_parse_ext_data); +static int ext_man_get_fw_version(struct snd_sof_dev *sdev, + const struct sof_ext_man_elem_header *hdr) +{ + const struct sof_ext_man_fw_version *v; + + v = container_of(hdr, struct sof_ext_man_fw_version, hdr); + + memcpy(&sdev->fw_ready.version, &v->version, sizeof(v->version)); + sdev->fw_ready.flags = v->flags; + + /* log ABI versions and check FW compatibility */ + return snd_sof_ipc_valid(sdev); +} + static ssize_t snd_sof_ext_man_size(const struct firmware *fw) { const struct sof_ext_man_header *head = (void *)fw->data; @@ -201,6 +215,9 @@ static int snd_sof_fw_ext_man_parse(struct snd_sof_dev *sdev, /* process structure data */ switch (elem_hdr->type) { + case SOF_EXT_MAN_ELEM_FW_VERSION: + ret = ext_man_get_fw_version(sdev, elem_hdr); + break; default: dev_warn(sdev->dev, "warning: unknown sof_ext_man header type %d size 0x%X\n", elem_hdr->type, elem_hdr->size); -- cgit v1.2.3 From 9e72f13ee541ccae34dd4a6735bcdf0c78090216 Mon Sep 17 00:00:00 2001 From: Karol Trzcinski Date: Wed, 15 Apr 2020 15:27:58 -0500 Subject: ASoC: SOF: ext_manifest: parse windows The window description can be extracted from the extended manifest content. This information known at build time does not need to be provided in a mailbox. Signed-off-by: Karol Trzcinski Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Kai Vehmanen Link: https://lore.kernel.org/r/20200415202816.934-7-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/ext_manifest.h | 9 +++++++++ sound/soc/sof/loader.c | 27 +++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/sound/sof/ext_manifest.h b/include/uapi/sound/sof/ext_manifest.h index 163d32210ad8..203c203f6531 100644 --- a/include/uapi/sound/sof/ext_manifest.h +++ b/include/uapi/sound/sof/ext_manifest.h @@ -17,6 +17,7 @@ #define __SOF_FIRMWARE_EXT_MANIFEST_H__ #include +#include /* In ASCII `XMan` */ #define SOF_EXT_MAN_MAGIC_NUMBER 0x6e614d58 @@ -54,6 +55,7 @@ struct sof_ext_man_header { /* Extended manifest elements types */ enum sof_ext_man_elem_type { SOF_EXT_MAN_ELEM_FW_VERSION = 0, + SOF_EXT_MAN_ELEM_WINDOW = SOF_IPC_EXT_WINDOW, }; /* extended manifest element header */ @@ -71,4 +73,11 @@ struct sof_ext_man_fw_version { uint32_t flags; } __packed; +/* extended data memory windows for IPC, trace and debug */ +struct sof_ext_man_window { + struct sof_ext_man_elem_header hdr; + /* use sof_ipc struct because of code re-use */ + struct sof_ipc_window ipc_window; +} __packed; + #endif /* __SOF_FIRMWARE_EXT_MANIFEST_H__ */ diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c index 01077c1ce235..bbfdf07fa6f5 100644 --- a/sound/soc/sof/loader.c +++ b/sound/soc/sof/loader.c @@ -20,13 +20,21 @@ static int get_ext_windows(struct snd_sof_dev *sdev, { const struct sof_ipc_window *w = container_of(ext_hdr, struct sof_ipc_window, ext_hdr); + size_t w_size = struct_size(w, window, w->num_windows); + + if (sdev->info_window) { + if (memcmp(sdev->info_window, w, w_size)) { + dev_err(sdev->dev, "error: mistmatch between window descriptor from extended manifest and mailbox"); + return -EINVAL; + } + return 0; + } if (w->num_windows == 0 || w->num_windows > SOF_IPC_MAX_ELEMS) return -EINVAL; /* keep a local copy of the data */ - sdev->info_window = kmemdup(w, struct_size(w, window, w->num_windows), - GFP_KERNEL); + sdev->info_window = kmemdup(w, w_size, GFP_KERNEL); if (!sdev->info_window) return -ENOMEM; @@ -141,6 +149,18 @@ static int ext_man_get_fw_version(struct snd_sof_dev *sdev, return snd_sof_ipc_valid(sdev); } +static int ext_man_get_windows(struct snd_sof_dev *sdev, + const struct sof_ext_man_elem_header *hdr) +{ + const struct sof_ipc_ext_data_hdr *w_ipc; + const struct sof_ext_man_window *w; + + w = container_of(hdr, struct sof_ext_man_window, hdr); + w_ipc = (const struct sof_ipc_ext_data_hdr *)&w->ipc_window; + + return get_ext_windows(sdev, w_ipc); +} + static ssize_t snd_sof_ext_man_size(const struct firmware *fw) { const struct sof_ext_man_header *head = (void *)fw->data; @@ -218,6 +238,9 @@ static int snd_sof_fw_ext_man_parse(struct snd_sof_dev *sdev, case SOF_EXT_MAN_ELEM_FW_VERSION: ret = ext_man_get_fw_version(sdev, elem_hdr); break; + case SOF_EXT_MAN_ELEM_WINDOW: + ret = ext_man_get_windows(sdev, elem_hdr); + break; default: dev_warn(sdev->dev, "warning: unknown sof_ext_man header type %d size 0x%X\n", elem_hdr->type, elem_hdr->size); -- cgit v1.2.3 From 7c024b948c206935e69aafa56187bff9dd36abed Mon Sep 17 00:00:00 2001 From: Karol Trzcinski Date: Wed, 15 Apr 2020 15:27:59 -0500 Subject: ASoC: SOF: ext_manifest: parse compiler version The compiler version and description can be extracted from the extended manifest content. This information known at build time does not need to be provided in a mailbox. Signed-off-by: Karol Trzcinski Signed-off-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Reviewed-by: Kai Vehmanen Link: https://lore.kernel.org/r/20200415202816.934-8-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/ext_manifest.h | 8 ++++++++ sound/soc/sof/loader.c | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/sof/ext_manifest.h b/include/uapi/sound/sof/ext_manifest.h index 203c203f6531..d49c47d08c7f 100644 --- a/include/uapi/sound/sof/ext_manifest.h +++ b/include/uapi/sound/sof/ext_manifest.h @@ -56,6 +56,7 @@ struct sof_ext_man_header { enum sof_ext_man_elem_type { SOF_EXT_MAN_ELEM_FW_VERSION = 0, SOF_EXT_MAN_ELEM_WINDOW = SOF_IPC_EXT_WINDOW, + SOF_EXT_MAN_ELEM_CC_VERSION = SOF_IPC_EXT_CC_INFO, }; /* extended manifest element header */ @@ -80,4 +81,11 @@ struct sof_ext_man_window { struct sof_ipc_window ipc_window; } __packed; +/* Used C compiler description */ +struct sof_ext_man_cc_version { + struct sof_ext_man_elem_header hdr; + /* use sof_ipc struct because of code re-use */ + struct sof_ipc_cc_version cc_version; +} __packed; + #endif /* __SOF_FIRMWARE_EXT_MANIFEST_H__ */ diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c index bbfdf07fa6f5..8be30cd5e038 100644 --- a/sound/soc/sof/loader.c +++ b/sound/soc/sof/loader.c @@ -49,6 +49,14 @@ static int get_cc_info(struct snd_sof_dev *sdev, const struct sof_ipc_cc_version *cc = container_of(ext_hdr, struct sof_ipc_cc_version, ext_hdr); + if (sdev->cc_version) { + if (memcmp(sdev->cc_version, cc, cc->ext_hdr.hdr.size)) { + dev_err(sdev->dev, "error: receive diverged cc_version descriptions"); + return -EINVAL; + } + return 0; + } + dev_dbg(sdev->dev, "Firmware info: used compiler %s %d:%d:%d%s used optimization flags %s\n", cc->name, cc->major, cc->minor, cc->micro, cc->desc, cc->optim); @@ -161,6 +169,18 @@ static int ext_man_get_windows(struct snd_sof_dev *sdev, return get_ext_windows(sdev, w_ipc); } +static int ext_man_get_cc_info(struct snd_sof_dev *sdev, + const struct sof_ext_man_elem_header *hdr) +{ + const struct sof_ext_man_cc_version *cc; + const struct sof_ipc_ext_data_hdr *cc_version; + + cc = container_of(hdr, struct sof_ext_man_cc_version, hdr); + cc_version = (const struct sof_ipc_ext_data_hdr *)&cc->cc_version; + + return get_cc_info(sdev, cc_version); +} + static ssize_t snd_sof_ext_man_size(const struct firmware *fw) { const struct sof_ext_man_header *head = (void *)fw->data; @@ -241,6 +261,9 @@ static int snd_sof_fw_ext_man_parse(struct snd_sof_dev *sdev, case SOF_EXT_MAN_ELEM_WINDOW: ret = ext_man_get_windows(sdev, elem_hdr); break; + case SOF_EXT_MAN_ELEM_CC_VERSION: + ret = ext_man_get_cc_info(sdev, elem_hdr); + break; default: dev_warn(sdev->dev, "warning: unknown sof_ext_man header type %d size 0x%X\n", elem_hdr->type, elem_hdr->size); -- cgit v1.2.3 From 786d6516877dc852392117d44a77bf3093c39dbd Mon Sep 17 00:00:00 2001 From: Pan Xiuli Date: Wed, 15 Apr 2020 15:28:01 -0500 Subject: ASoC: SOF: add probe support extend data Share how many injection probe DMAs and how many probe points driver can request from FW. injection_dmas_max 0 means injection is not supported probe_points_max 0 means whole probes subsystem in FW is not enabled and not compiled in. ABI version change to 3.14.0 Signed-off-by: Pan Xiuli Signed-off-by: Pierre-Louis Bossart Reviewed-by: Kai Vehmanen Reviewed-by: Guennadi Liakhovetski Link: https://lore.kernel.org/r/20200415202816.934-10-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/sound/sof/info.h | 12 ++++++++++++ include/uapi/sound/sof/abi.h | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/sound/sof/info.h b/include/sound/sof/info.h index 438a11fcf272..68e92b550439 100644 --- a/include/sound/sof/info.h +++ b/include/sound/sof/info.h @@ -31,6 +31,7 @@ enum sof_ipc_ext_data { SOF_IPC_EXT_UNUSED = 0, SOF_IPC_EXT_WINDOW = 1, SOF_IPC_EXT_CC_INFO = 2, + SOF_IPC_EXT_PROBE_INFO = 3, }; /* FW version - SOF_IPC_GLB_VERSION */ @@ -114,4 +115,15 @@ struct sof_ipc_cc_version { char desc[]; /* null terminated compiler description */ } __packed; +/* extended data: Probe setup */ +struct sof_ipc_probe_support { + struct sof_ipc_ext_data_hdr ext_hdr; + + uint32_t probe_points_max; + uint32_t injection_dmas_max; + + /* reserved for future use */ + uint32_t reserved[2]; +} __packed; + #endif diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h index 5995b79d6df1..e0fa2939d49c 100644 --- a/include/uapi/sound/sof/abi.h +++ b/include/uapi/sound/sof/abi.h @@ -26,7 +26,7 @@ /* SOF ABI version major, minor and patch numbers */ #define SOF_ABI_MAJOR 3 -#define SOF_ABI_MINOR 13 +#define SOF_ABI_MINOR 14 #define SOF_ABI_PATCH 0 /* SOF ABI version number. Format within 32bit word is MMmmmppp */ -- cgit v1.2.3 From a9a9cbf081414de0261279e3b11ada2f0a7f7e83 Mon Sep 17 00:00:00 2001 From: Pan Xiuli Date: Wed, 15 Apr 2020 15:28:06 -0500 Subject: ASoC: SOF: make sof_ipc_cc_version to fixed length Align struct sof_ipc_cc_version to firmware definition in SOF ABI 3.15.0. The struct definition was changed due to errors in FW build. The Cadence XCC compiler produces incorrect linkage section sizes, when a variable length array is used in the compiler version struct. The firmware definition was changed to a fixed 32 byte compiler description string. This length covers all released firmware binaries and thus only a minor ABI change is needed. As the same structure is used in IPC messages between driver and firmware, the kernel needs to be aligned to firmware change. Signed-off-by: Pan Xiuli Signed-off-by: Pierre-Louis Bossart Reviewed-by: Kai Vehmanen Link: https://lore.kernel.org/r/20200415202816.934-15-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/sound/sof/info.h | 2 +- include/uapi/sound/sof/abi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/sound/sof/info.h b/include/sound/sof/info.h index 2ef98b2fee1f..d5eff3179a39 100644 --- a/include/sound/sof/info.h +++ b/include/sound/sof/info.h @@ -113,7 +113,7 @@ struct sof_ipc_cc_version { uint8_t name[16]; /* null terminated compiler name */ uint8_t optim[4]; /* null terminated compiler -O flag value */ - uint8_t desc[]; /* null terminated compiler description */ + uint8_t desc[32]; /* null terminated compiler description */ } __packed; /* extended data: Probe setup */ diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h index e0fa2939d49c..6c802a2386ef 100644 --- a/include/uapi/sound/sof/abi.h +++ b/include/uapi/sound/sof/abi.h @@ -26,7 +26,7 @@ /* SOF ABI version major, minor and patch numbers */ #define SOF_ABI_MAJOR 3 -#define SOF_ABI_MINOR 14 +#define SOF_ABI_MINOR 15 #define SOF_ABI_PATCH 0 /* SOF ABI version number. Format within 32bit word is MMmmmppp */ -- cgit v1.2.3 From c7fc96dfc41d168e94d26c455123193e6e59bf24 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 15 Apr 2020 15:28:12 -0500 Subject: ASoC: SOF: topology: Get ALH rate amd channels from topology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FW will need these params for synchronized playback over multiple DAIs. Signed-off-by: Bard Liao Signed-off-by: Pierre-Louis Bossart Reviewed-by: Guennadi Liakhovetski Reviewed-by: Sławomir Błauciak Reviewed-by: Kai Vehmanen Link: https://lore.kernel.org/r/20200415202816.934-21-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/tokens.h | 4 ++++ sound/soc/sof/topology.c | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h index 2a25cd8da503..b7ad1cd4526a 100644 --- a/include/uapi/sound/sof/tokens.h +++ b/include/uapi/sound/sof/tokens.h @@ -126,4 +126,8 @@ #define SOF_TKN_MUTE_LED_USE 1300 #define SOF_TKN_MUTE_LED_DIRECTION 1301 +/* ALH */ +#define SOF_TKN_INTEL_ALH_RATE 1400 +#define SOF_TKN_INTEL_ALH_CH 1401 + #endif diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c index 3670b4221ba6..f59c34bb085b 100644 --- a/sound/soc/sof/topology.c +++ b/sound/soc/sof/topology.c @@ -656,6 +656,16 @@ static const struct sof_topology_token ssp_tokens[] = { }; +/* ALH */ +static const struct sof_topology_token alh_tokens[] = { + {SOF_TKN_INTEL_ALH_RATE, + SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, + offsetof(struct sof_ipc_dai_alh_params, rate), 0}, + {SOF_TKN_INTEL_ALH_CH, + SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, + offsetof(struct sof_ipc_dai_alh_params, channels), 0}, +}; + /* DMIC */ static const struct sof_topology_token dmic_tokens[] = { {SOF_TKN_INTEL_DMIC_DRIVER_VERSION, @@ -3095,13 +3105,26 @@ static int sof_link_alh_load(struct snd_soc_component *scomp, int index, struct sof_ipc_dai_config *config) { struct snd_sof_dev *sdev = snd_soc_component_get_drvdata(scomp); + struct snd_soc_tplg_private *private = &cfg->priv; struct sof_ipc_reply reply; u32 size = sizeof(*config); int ret; + ret = sof_parse_tokens(scomp, &config->alh, alh_tokens, + ARRAY_SIZE(alh_tokens), private->array, + le32_to_cpu(private->size)); + if (ret != 0) { + dev_err(scomp->dev, "error: parse alh tokens failed %d\n", + le32_to_cpu(private->size)); + return ret; + } + /* init IPC */ config->hdr.size = size; + dev_dbg(scomp->dev, "ALH config rate %d channels %d\n", + config->alh.rate, config->alh.channels); + /* send message to DSP */ ret = sof_ipc_tx_message(sdev->ipc, config->hdr.cmd, config, size, &reply, -- cgit v1.2.3 From 18aaab64fbb121e5879f74a46903bcfd30bf660b Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 15 Apr 2020 15:28:14 -0500 Subject: ASoC: SOF: topology: Get HDA rate and channels from topology FW interface for HDA DAI parameters was extended with information on sampling rate and channel count in version 3.16. Align kernel header with the FW change. This change is backwards compatible. Old firmware will ignore the values. Signed-off-by: Bard Liao Signed-off-by: Pierre-Louis Bossart Reviewed-by: Guennadi Liakhovetski Reviewed-by: Kai Vehmanen Link: https://lore.kernel.org/r/20200415202816.934-23-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- include/sound/sof/dai-intel.h | 2 ++ include/uapi/sound/sof/abi.h | 2 +- include/uapi/sound/sof/tokens.h | 4 ++++ sound/soc/sof/topology.c | 9 +++++++++ 4 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/sound/sof/dai-intel.h b/include/sound/sof/dai-intel.h index 4db906c4a534..896db2243d87 100644 --- a/include/sound/sof/dai-intel.h +++ b/include/sound/sof/dai-intel.h @@ -88,6 +88,8 @@ struct sof_ipc_dai_ssp_params { struct sof_ipc_dai_hda_params { struct sof_ipc_hdr hdr; uint32_t link_dma_ch; + uint32_t rate; + uint32_t channels; } __packed; /* ALH Configuration Request - SOF_IPC_DAI_ALH_CONFIG */ diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h index 6c802a2386ef..d54be303090f 100644 --- a/include/uapi/sound/sof/abi.h +++ b/include/uapi/sound/sof/abi.h @@ -26,7 +26,7 @@ /* SOF ABI version major, minor and patch numbers */ #define SOF_ABI_MAJOR 3 -#define SOF_ABI_MINOR 15 +#define SOF_ABI_MINOR 16 #define SOF_ABI_PATCH 0 /* SOF ABI version number. Format within 32bit word is MMmmmppp */ diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h index b7ad1cd4526a..5941e2eb1588 100644 --- a/include/uapi/sound/sof/tokens.h +++ b/include/uapi/sound/sof/tokens.h @@ -130,4 +130,8 @@ #define SOF_TKN_INTEL_ALH_RATE 1400 #define SOF_TKN_INTEL_ALH_CH 1401 +/* HDA */ +#define SOF_TKN_INTEL_HDA_RATE 1500 +#define SOF_TKN_INTEL_HDA_CH 1501 + #endif diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c index 51d19ffe35b3..a1287924a12d 100644 --- a/sound/soc/sof/topology.c +++ b/sound/soc/sof/topology.c @@ -753,6 +753,12 @@ static const struct sof_topology_token dmic_pdm_tokens[] = { /* HDA */ static const struct sof_topology_token hda_tokens[] = { + {SOF_TKN_INTEL_HDA_RATE, + SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, + offsetof(struct sof_ipc_dai_hda_params, rate), 0}, + {SOF_TKN_INTEL_HDA_CH, + SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, + offsetof(struct sof_ipc_dai_hda_params, channels), 0}, }; /* Leds */ @@ -3083,6 +3089,9 @@ static int sof_link_hda_load(struct snd_soc_component *scomp, int index, return ret; } + dev_dbg(scomp->dev, "HDA config rate %d channels %d\n", + config->hda.rate, config->hda.channels); + dai = snd_soc_find_dai(link->cpus); if (!dai) { dev_err(scomp->dev, "error: failed to find dai %s in %s", -- cgit v1.2.3 From 980737282232b752bb14dab96d77665c15889c36 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:45:31 +0300 Subject: capabilities: Introduce CAP_PERFMON to kernel and user space Introduce the CAP_PERFMON capability designed to secure system performance monitoring and observability operations so that CAP_PERFMON can assist CAP_SYS_ADMIN capability in its governing role for performance monitoring and observability subsystems. CAP_PERFMON hardens system security and integrity during performance monitoring and observability operations by decreasing attack surface that is available to a CAP_SYS_ADMIN privileged process [2]. Providing the access to system performance monitoring and observability operations under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes the operation more secure. Thus, CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e: 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) CAP_PERFMON meets the demand to secure system performance monitoring and observability operations for adoption in security sensitive, restricted, multiuser production environments (e.g. HPC clusters, cloud and virtual compute environments), where root or CAP_SYS_ADMIN credentials are not available to mass users of a system, and securely unblocks applicability and scalability of system performance monitoring and observability operations beyond root and CAP_SYS_ADMIN use cases. CAP_PERFMON takes over CAP_SYS_ADMIN credentials related to system performance monitoring and observability operations and balances amount of CAP_SYS_ADMIN credentials following the recommendations in the capabilities man page [1] for CAP_SYS_ADMIN: "Note: this capability is overloaded; see Notes to kernel developers, below." For backward compatibility reasons access to system performance monitoring and observability subsystems of the kernel remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN capability usage for secure system performance monitoring and observability operations is discouraged with respect to the designed CAP_PERFMON capability. Although the software running under CAP_PERFMON can not ensure avoidance of related hardware issues, the software can still mitigate these issues following the official hardware issues mitigation procedure [2]. The bugs in the software itself can be fixed following the standard kernel development process [3] to maintain and harden security of system performance monitoring and observability operations. [1] http://man7.org/linux/man-pages/man7/capabilities.7.html [2] https://www.kernel.org/doc/html/latest/process/embargoed-hardware-issues.html [3] https://www.kernel.org/doc/html/latest/admin-guide/security-bugs.html Signed-off-by: Alexey Budankov Acked-by: James Morris Acked-by: Serge E. Hallyn Acked-by: Song Liu Acked-by: Stephen Smalley Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-man@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/5590d543-82c6-490a-6544-08e6a5517db0@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/capability.h | 4 ++++ include/uapi/linux/capability.h | 8 +++++++- security/selinux/include/classmap.h | 4 ++-- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/capability.h b/include/linux/capability.h index ecce0f43c73a..027d7e4a853b 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); +static inline bool perfmon_capable(void) +{ + return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); +} /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 272dc69fa080..e58c9636741b 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -367,8 +367,14 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_READ 37 +/* + * Allow system performance and observability privileged operations + * using perf_events, i915_perf and other kernel subsystems + */ + +#define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_AUDIT_READ +#define CAP_LAST_CAP CAP_PERFMON #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 986f3ac14282..d233ab3f1533 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read" + "wake_alarm", "block_suspend", "audit_read", "perfmon" -#if CAP_LAST_CAP > CAP_AUDIT_READ +#if CAP_LAST_CAP > CAP_PERFMON #error New capability defined, please update COMMON_CAP2_PERMS. #endif -- cgit v1.2.3 From 104edb94cc4b3101bab33161cd861de13e85610b Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Fri, 27 Mar 2020 10:59:48 +0530 Subject: tee: add private login method for kernel clients There are use-cases where user-space shouldn't be allowed to communicate directly with a TEE device which is dedicated to provide a specific service for a kernel client. So add a private login method for kernel clients and disallow user-space to open-session using GP implementation defined login method range: (0x80000000 - 0xBFFFFFFF). Reviewed-by: Jerome Forissier Signed-off-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 7 +++++++ include/uapi/linux/tee.h | 9 +++++++++ 2 files changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 6aec502c495c..fb907bf43708 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -333,6 +333,13 @@ static int tee_ioctl_open_session(struct tee_context *ctx, goto out; } + if (arg.clnt_login >= TEE_IOCTL_LOGIN_REE_KERNEL_MIN && + arg.clnt_login <= TEE_IOCTL_LOGIN_REE_KERNEL_MAX) { + pr_debug("login method not allowed for user-space client\n"); + rc = -EPERM; + goto out; + } + rc = ctx->teedev->desc->ops->open_session(ctx, &arg, params); if (rc) goto out; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 6596f3a09e54..b619f37ee03e 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -173,6 +173,15 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_LOGIN_APPLICATION 4 #define TEE_IOCTL_LOGIN_USER_APPLICATION 5 #define TEE_IOCTL_LOGIN_GROUP_APPLICATION 6 +/* + * Disallow user-space to use GP implementation specific login + * method range (0x80000000 - 0xBFFFFFFF). This range is rather + * being reserved for REE kernel clients or TEE implementation. + */ +#define TEE_IOCTL_LOGIN_REE_KERNEL_MIN 0x80000000 +#define TEE_IOCTL_LOGIN_REE_KERNEL_MAX 0xBFFFFFFF +/* Private login method for REE kernel clients */ +#define TEE_IOCTL_LOGIN_REE_KERNEL 0x80000000 /** * struct tee_ioctl_param - parameter -- cgit v1.2.3 From eec517cdb4810b3843eb7707971de3164088bff1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:50 +0200 Subject: net: Add IF_OPER_TESTING RFC 2863 defines the operational state testing. Add support for this state, both as a IF_LINK_MODE_ and __LINK_STATE_. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if.h | 1 + net/core/dev.c | 5 +++++ net/core/link_watch.c | 12 ++++++++++-- net/core/rtnetlink.c | 9 ++++++++- 5 files changed, 65 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..0750b54b3765 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -288,6 +288,7 @@ enum netdev_state_t { __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, + __LINK_STATE_TESTING, }; @@ -3907,6 +3908,46 @@ static inline bool netif_dormant(const struct net_device *dev) } +/** + * netif_testing_on - mark device as under test. + * @dev: network device + * + * Mark device as under test (as per RFC2863). + * + * The testing state indicates that some test(s) must be performed on + * the interface. After completion, of the test, the interface state + * will change to up, dormant, or down, as appropriate. + */ +static inline void netif_testing_on(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing_off - set device as not under test. + * @dev: network device + * + * Device is not in testing state. + */ +static inline void netif_testing_off(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing - test if device is under test + * @dev: network device + * + * Check if device is under test + */ +static inline bool netif_testing(const struct net_device *dev) +{ + return test_bit(__LINK_STATE_TESTING, &dev->state); +} + + /** * netif_oper_up - test if device is operational * @dev: network device diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index be714cd8c826..797ba2c1562a 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -178,6 +178,7 @@ enum { enum { IF_LINK_MODE_DEFAULT, IF_LINK_MODE_DORMANT, /* limit upward transition to dormant */ + IF_LINK_MODE_TESTING, /* limit upward transition to testing */ }; /* diff --git a/net/core/dev.c b/net/core/dev.c index 522288177bbd..fb61522b1ce1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9136,6 +9136,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else diff --git a/net/core/link_watch.c b/net/core/link_watch.c index f153e0601838..75431ca9300f 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { + if (netif_testing(dev)) + return IF_OPER_TESTING; + if (!netif_carrier_ok(dev)) return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); @@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev) write_lock_bh(&dev_base_lock); switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; @@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev) void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 709ebbf8ab5b..d6f4f4a9e8ba 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition) switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) -- cgit v1.2.3 From 72ef5e52b3f74c0be47b20f5c434b7ecc830cf40 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:35 +0200 Subject: docs: fix broken references to text files Several references got broken due to txt to ReST conversion. Several of them can be automatically fixed with: scripts/documentation-file-ref-check --fix Reviewed-by: Mathieu Poirier # hwtracing/coresight/Kconfig Reviewed-by: Paul E. McKenney # memory-barrier.txt Acked-by: Alex Shi # translations/zh_CN Acked-by: Federico Vaga # translations/it_IT Acked-by: Marc Zyngier # kvm/arm64 Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6f919ddb83a33b5f2a63b6b5f0575737bb2b36aa.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/memory-barriers.txt | 2 +- Documentation/process/submit-checklist.rst | 2 +- .../translations/it_IT/process/submit-checklist.rst | 2 +- Documentation/translations/ko_KR/memory-barriers.txt | 2 +- Documentation/translations/zh_CN/filesystems/sysfs.txt | 2 +- .../translations/zh_CN/process/submit-checklist.rst | 2 +- Documentation/virt/kvm/arm/pvtime.rst | 2 +- Documentation/virt/kvm/devices/vcpu.rst | 2 +- Documentation/virt/kvm/hypercalls.rst | 4 ++-- arch/powerpc/include/uapi/asm/kvm_para.h | 2 +- drivers/gpu/drm/Kconfig | 2 +- drivers/gpu/drm/drm_ioctl.c | 2 +- drivers/hwtracing/coresight/Kconfig | 2 +- fs/fat/Kconfig | 8 ++++---- fs/fuse/Kconfig | 2 +- fs/fuse/dev.c | 2 +- fs/overlayfs/Kconfig | 6 +++--- include/linux/mm.h | 4 ++-- include/uapi/linux/ethtool_netlink.h | 2 +- include/uapi/rdma/rdma_user_ioctl_cmds.h | 2 +- mm/gup.c | 12 ++++++------ virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 4 ++-- 23 files changed, 36 insertions(+), 36 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index e1c355e84edd..eaabc3134294 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -620,7 +620,7 @@ because the CPUs that the Linux kernel supports don't do writes until they are certain (1) that the write will actually happen, (2) of the location of the write, and (3) of the value to be written. But please carefully read the "CONTROL DEPENDENCIES" section and the -Documentation/RCU/rcu_dereference.txt file: The compiler can and does +Documentation/RCU/rcu_dereference.rst file: The compiler can and does break dependencies in a great many highly creative ways. CPU 1 CPU 2 diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index 8e56337d422d..3f8e9d5d95c2 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -107,7 +107,7 @@ and elsewhere regarding submitting Linux kernel patches. and why. 26) If any ioctl's are added by the patch, then also update - ``Documentation/ioctl/ioctl-number.rst``. + ``Documentation/userspace-api/ioctl/ioctl-number.rst``. 27) If your modified source code depends on or uses any of the kernel APIs or features that are related to the following ``Kconfig`` symbols, diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst index 995ee69fab11..3e575502690f 100644 --- a/Documentation/translations/it_IT/process/submit-checklist.rst +++ b/Documentation/translations/it_IT/process/submit-checklist.rst @@ -117,7 +117,7 @@ sottomissione delle patch, in particolare sorgenti che ne spieghi la logica: cosa fanno e perché. 25) Se la patch aggiunge nuove chiamate ioctl, allora aggiornate - ``Documentation/ioctl/ioctl-number.rst``. + ``Documentation/userspace-api/ioctl/ioctl-number.rst``. 26) Se il codice che avete modificato dipende o usa una qualsiasi interfaccia o funzionalità del kernel che è associata a uno dei seguenti simboli diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 2e831ece6e26..e50fe6541335 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -641,7 +641,7 @@ P 는 짝수 번호 캐시 라인에 저장되어 있고, 변수 B 는 홀수 리눅스 커널이 지원하는 CPU 들은 (1) 쓰기가 정말로 일어날지, (2) 쓰기가 어디에 이루어질지, 그리고 (3) 쓰여질 값을 확실히 알기 전까지는 쓰기를 수행하지 않기 때문입니다. 하지만 "컨트롤 의존성" 섹션과 -Documentation/RCU/rcu_dereference.txt 파일을 주의 깊게 읽어 주시기 바랍니다: +Documentation/RCU/rcu_dereference.rst 파일을 주의 깊게 읽어 주시기 바랍니다: 컴파일러는 매우 창의적인 많은 방법으로 종속성을 깰 수 있습니다. CPU 1 CPU 2 diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index ee1f37da5b23..a15c3ebdfa82 100644 --- a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -281,7 +281,7 @@ drivers/ 包含了每个已为特定总线上的设备而挂载的驱动程序 假定驱动没有跨越多个总线类型)。 fs/ 包含了一个为文件系统设立的目录。现在每个想要导出属性的文件系统必须 -在 fs/ 下创建自己的层次结构(参见Documentation/filesystems/fuse.txt)。 +在 fs/ 下创建自己的层次结构(参见Documentation/filesystems/fuse.rst)。 dev/ 包含两个子目录: char/ 和 block/。在这两个子目录中,有以 : 格式命名的符号链接。这些符号链接指向 sysfs 目录 diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst index 8738c55e42a2..50386e0e42e7 100644 --- a/Documentation/translations/zh_CN/process/submit-checklist.rst +++ b/Documentation/translations/zh_CN/process/submit-checklist.rst @@ -97,7 +97,7 @@ Linux内核补丁提交清单 24) 所有内存屏障例如 ``barrier()``, ``rmb()``, ``wmb()`` 都需要源代码中的注 释来解释它们正在执行的操作及其原因的逻辑。 -25) 如果补丁添加了任何ioctl,那么也要更新 ``Documentation/ioctl/ioctl-number.rst`` +25) 如果补丁添加了任何ioctl,那么也要更新 ``Documentation/userspace-api/ioctl/ioctl-number.rst`` 26) 如果修改后的源代码依赖或使用与以下 ``Kconfig`` 符号相关的任何内核API或 功能,则在禁用相关 ``Kconfig`` 符号和/或 ``=m`` (如果该选项可用)的情况 diff --git a/Documentation/virt/kvm/arm/pvtime.rst b/Documentation/virt/kvm/arm/pvtime.rst index 2357dd2d8655..687b60d76ca9 100644 --- a/Documentation/virt/kvm/arm/pvtime.rst +++ b/Documentation/virt/kvm/arm/pvtime.rst @@ -76,5 +76,5 @@ It is advisable that one or more 64k pages are set aside for the purpose of these structures and not used for other purposes, this enables the guest to map the region using 64k pages and avoids conflicting attributes with other memory. -For the user space interface see Documentation/virt/kvm/devices/vcpu.txt +For the user space interface see Documentation/virt/kvm/devices/vcpu.rst section "3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL". diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 9963e680770a..ca374d3fe085 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -110,5 +110,5 @@ Returns: Specifies the base address of the stolen time structure for this VCPU. The base address must be 64 byte aligned and exist within a valid guest memory -region. See Documentation/virt/kvm/arm/pvtime.txt for more information +region. See Documentation/virt/kvm/arm/pvtime.rst for more information including the layout of the stolen time structure. diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst index dbaf207e560d..ed4fddd364ea 100644 --- a/Documentation/virt/kvm/hypercalls.rst +++ b/Documentation/virt/kvm/hypercalls.rst @@ -22,7 +22,7 @@ S390: number in R1. For further information on the S390 diagnose call as supported by KVM, - refer to Documentation/virt/kvm/s390-diag.txt. + refer to Documentation/virt/kvm/s390-diag.rst. PowerPC: It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. @@ -30,7 +30,7 @@ PowerPC: KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' property inside the device tree's /hypervisor node. - For more information refer to Documentation/virt/kvm/ppc-pv.txt + For more information refer to Documentation/virt/kvm/ppc-pv.rst MIPS: KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h index be48c2215fa2..a809b1b44ddf 100644 --- a/arch/powerpc/include/uapi/asm/kvm_para.h +++ b/arch/powerpc/include/uapi/asm/kvm_para.h @@ -31,7 +31,7 @@ * Struct fields are always 32 or 64 bit aligned, depending on them being 32 * or 64 bit wide respectively. * - * See Documentation/virt/kvm/ppc-pv.txt + * See Documentation/virt/kvm/ppc-pv.rst */ struct kvm_vcpu_arch_shared { __u64 scratch1; diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 43594978958e..fb92be7e8aa7 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -161,7 +161,7 @@ config DRM_LOAD_EDID_FIRMWARE monitor are unable to provide appropriate EDID data. Since this feature is provided as a workaround for broken hardware, the default case is N. Details and instructions how to build your own - EDID data are given in Documentation/driver-api/edid.rst. + EDID data are given in Documentation/admin-guide/edid.rst. config DRM_DP_CEC bool "Enable DisplayPort CEC-Tunneling-over-AUX HDMI support" diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 9e41972c4bbc..c2b8d2a953ae 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -741,7 +741,7 @@ static const struct drm_ioctl_desc drm_ioctls[] = { * }; * * Please make sure that you follow all the best practices from - * ``Documentation/ioctl/botching-up-ioctls.rst``. Note that drm_ioctl() + * ``Documentation/process/botching-up-ioctls.rst``. Note that drm_ioctl() * automatically zero-extends structures, hence make sure you can add more stuff * at the end, i.e. don't put a variable sized array there. * diff --git a/drivers/hwtracing/coresight/Kconfig b/drivers/hwtracing/coresight/Kconfig index 83e841be1081..02dbb5ca3bcf 100644 --- a/drivers/hwtracing/coresight/Kconfig +++ b/drivers/hwtracing/coresight/Kconfig @@ -107,7 +107,7 @@ config CORESIGHT_CPU_DEBUG can quickly get to know program counter (PC), secure state, exception level, etc. Before use debugging functionality, platform needs to ensure the clock domain and power domain are enabled - properly, please refer Documentation/trace/coresight-cpu-debug.rst + properly, please refer Documentation/trace/coresight/coresight-cpu-debug.rst for detailed description and the example for usage. config CORESIGHT_CTI diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 718163d0c621..ca31993dcb47 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -69,7 +69,7 @@ config VFAT_FS The VFAT support enlarges your kernel by about 10 KB and it only works if you said Y to the "DOS FAT fs support" above. Please read - the file for details. If + the file for details. If unsure, say Y. To compile this as a module, choose M here: the module will be called @@ -82,7 +82,7 @@ config FAT_DEFAULT_CODEPAGE help This option should be set to the codepage of your FAT filesystems. It can be overridden with the "codepage" mount option. - See for more information. + See for more information. config FAT_DEFAULT_IOCHARSET string "Default iocharset for FAT" @@ -96,7 +96,7 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here - select the next option instead if you would like to use UTF-8 encoded file names by default. - See for more information. + See for more information. Enable any character sets you need in File Systems/Native Language Support. @@ -114,4 +114,4 @@ config FAT_DEFAULT_UTF8 Say Y if you use UTF-8 encoding for file names, N otherwise. - See for more information. + See for more information. diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index eb2a585572dc..774b2618018a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -12,7 +12,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See for more information. + See for more information. See for needed library/utility version. If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..c7a65cf2bcca 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2081,7 +2081,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.txt). + * Documentation/filesystems/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 714c14c47ca5..dd188c7996b3 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -9,7 +9,7 @@ config OVERLAY_FS 'lower' filesystem is either hidden or, in the case of directories, merged with the 'upper' object. - For more information see Documentation/filesystems/overlayfs.txt + For more information see Documentation/filesystems/overlayfs.rst config OVERLAY_FS_REDIRECT_DIR bool "Overlayfs: turn on redirect directory feature by default" @@ -38,7 +38,7 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW If backward compatibility is not an issue, then it is safe and recommended to say N here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say Y. @@ -103,7 +103,7 @@ config OVERLAY_FS_XINO_AUTO If compatibility with applications that expect 32bit inodes is not an issue, then it is safe and recommended to say Y here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say N. diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a323422d783..1f2850465f59 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1219,7 +1219,7 @@ void unpin_user_pages(struct page **pages, unsigned long npages); * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS * scheme). * - * For more information, please see Documentation/vm/pin_user_pages.rst. + * For more information, please see Documentation/core-api/pin_user_pages.rst. * * @page: pointer to page to be queried. * @Return: True, if it is likely that the page has been "dma-pinned". @@ -2834,7 +2834,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * releasing pages: get_user_pages*() pages must be released via put_page(), * while pin_user_pages*() pages must be released via unpin_user_page(). * - * Please see Documentation/vm/pin_user_pages.rst for more information. + * Please see Documentation/core-api/pin_user_pages.rst for more information. */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7fde76366ba4..1711e57f7848 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -2,7 +2,7 @@ /* * include/uapi/linux/ethtool_netlink.h - netlink interface for ethtool * - * See Documentation/networking/ethtool-netlink.txt in kernel source tree for + * See Documentation/networking/ethtool-netlink.rst in kernel source tree for * doucumentation of the interface. */ diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 7b1ec806f8f9..38ab7accb7be 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -36,7 +36,7 @@ #include #include -/* Documentation/ioctl/ioctl-number.rst */ +/* Documentation/userspace-api/ioctl/ioctl-number.rst */ #define RDMA_IOCTL_MAGIC 0x1b #define RDMA_VERBS_IOCTL \ _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) diff --git a/mm/gup.c b/mm/gup.c index 6076df8e04a4..81e4d0b377fd 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2843,9 +2843,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for further details. + * see Documentation/core-api/pin_user_pages.rst for further details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, @@ -2883,9 +2883,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -2919,9 +2919,9 @@ EXPORT_SYMBOL(pin_user_pages_remote); * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index e72dcc454247..859464fd413f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -301,7 +301,7 @@ static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, * pending state of interrupt is latched in pending_latch variable. * Userspace will save and restore pending state and line_level * separately. - * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst * for handling of ISPENDR and ICPENDR. */ for (i = 0; i < len * 8; i++) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 769e4802645e..64fcd7511110 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,7 +42,7 @@ VGIC_AFFINITY_LEVEL(val, 3)) /* - * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst, * below macros are defined for CPUREG encoding. */ #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 @@ -63,7 +63,7 @@ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) /* - * As per Documentation/virt/kvm/devices/arm-vgic-its.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-its.rst, * below macros are defined for ITS table entry encoding. */ #define KVM_ITS_CTE_VALID_SHIFT 63 -- cgit v1.2.3 From 3ecad8c2c1ff333e204c26e2f0dddfa623153f87 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:36 +0200 Subject: docs: fix broken references for ReST files that moved around Some broken references happened due to shifting files around and ReST renames. Those can't be auto-fixed by the script, so let's fix them manually. Signed-off-by: Mauro Carvalho Chehab Acked-by: Corentin Labbe Link: https://lore.kernel.org/r/64773a12b4410aaf3e3be89e3ec7e34de2484eea.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/doc-guide/maintainer-profile.rst | 2 +- Documentation/virt/kvm/mmu.rst | 2 +- Documentation/virt/kvm/review-checklist.rst | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c | 2 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c | 2 +- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c | 2 +- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c | 2 +- drivers/media/v4l2-core/v4l2-fwnode.c | 2 +- include/uapi/linux/kvm.h | 4 ++-- tools/include/uapi/linux/kvm.h | 4 ++-- 11 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/doc-guide/maintainer-profile.rst b/Documentation/doc-guide/maintainer-profile.rst index 5afc0ddba40a..755d39f0d407 100644 --- a/Documentation/doc-guide/maintainer-profile.rst +++ b/Documentation/doc-guide/maintainer-profile.rst @@ -6,7 +6,7 @@ Documentation subsystem maintainer entry profile The documentation "subsystem" is the central coordinating point for the kernel's documentation and associated infrastructure. It covers the hierarchy under Documentation/ (with the exception of -Documentation/device-tree), various utilities under scripts/ and, at least +Documentation/devicetree), various utilities under scripts/ and, at least some of the time, LICENSES/. It's worth noting, though, that the boundaries of this subsystem are rather diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 60981887d20b..46126ecc70f7 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -319,7 +319,7 @@ Handling a page fault is performed as follows: - If both P bit and R/W bit of error code are set, this could possibly be handled as a "fast page fault" (fixed without taking the MMU lock). See - the description in Documentation/virt/kvm/locking.txt. + the description in Documentation/virt/kvm/locking.rst. - if needed, walk the guest page tables to determine the guest translation (gva->gpa or ngpa->gpa) diff --git a/Documentation/virt/kvm/review-checklist.rst b/Documentation/virt/kvm/review-checklist.rst index 1f86a9d3f705..dc01aea4057b 100644 --- a/Documentation/virt/kvm/review-checklist.rst +++ b/Documentation/virt/kvm/review-checklist.rst @@ -10,7 +10,7 @@ Review checklist for kvm patches 2. Patches should be against kvm.git master branch. 3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/virt/kvm/api.txt + - the API must be documented in Documentation/virt/kvm/api.rst - the API must be discoverable using KVM_CHECK_EXTENSION 4. New state must include support for save/restore. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8071952e9cf2..fd59fee84631 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3586,7 +3586,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See - * Documentation/virt/kvm/locking.txt to get more detail. + * Documentation/virt/kvm/locking.rst to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c index a5fd8975f3d3..a6abb701bfc6 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c @@ -8,7 +8,7 @@ * This file add support for AES cipher with 128,192,256 bits keysize in * CBC and ECB mode. * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c index 3e4e4bbda34c..b957061424a1 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c @@ -7,7 +7,7 @@ * * Core file which registers crypto algorithms supported by the CryptoEngine. * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include #include diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c index 84d52fc3a2da..c89cb2ee2496 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c @@ -8,7 +8,7 @@ * This file add support for AES cipher with 128,192,256 bits keysize in * CBC and ECB mode. * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c index 6b301afffd11..8ba4f9c81dac 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c @@ -7,7 +7,7 @@ * * Core file which registers crypto algorithms supported by the SecuritySystem * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include #include diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c index 97f0f8b23b5d..8a1e1b95b379 100644 --- a/drivers/media/v4l2-core/v4l2-fwnode.c +++ b/drivers/media/v4l2-core/v4l2-fwnode.c @@ -980,7 +980,7 @@ static int v4l2_fwnode_reference_parse(struct device *dev, * * THIS EXAMPLE EXISTS MERELY TO DOCUMENT THIS FUNCTION. DO NOT USE IT AS A * REFERENCE IN HOW ACPI TABLES SHOULD BE WRITTEN!! See documentation under - * Documentation/acpi/dsd instead and especially graph.txt, + * Documentation/firmware-guide/acpi/dsd/ instead and especially graph.txt, * data-node-references.txt and leds.txt . * * Scope (\_SB.PCI0.I2C2) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 428c7dde6b4b..fdd632c833b4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virt/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.rst */ union { __u32 irq; @@ -1107,7 +1107,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virt/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.rst. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 4b95f9a31a2f..e5f32fcec68f 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virt/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.rst */ union { __u32 irq; @@ -1100,7 +1100,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virt/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.rst. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) -- cgit v1.2.3 From af690f459393017ce40bb9beef6a00e79511574d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:56 +0200 Subject: firewire: firewire-cdev.hL get rid of a docs warning This warning: ./include/uapi/linux/firewire-cdev.h:312: WARNING: Inline literal start-string without end-string. is because %FOO doesn't work if there's a parenthesis at the string (as a parenthesis may indicate a function). So, mark the literal block using the alternate ``FOO`` syntax. Acked-by: Stefan Richter Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9b2501a41eba27ccdd4603cac2353c0efba7a90a.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/uapi/linux/firewire-cdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 1acd2b179aef..7e5b5c10a49c 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -308,7 +308,7 @@ struct fw_cdev_event_iso_interrupt_mc { /** * struct fw_cdev_event_iso_resource - Iso resources were allocated or freed * @closure: See &fw_cdev_event_common; - * set by %FW_CDEV_IOC_(DE)ALLOCATE_ISO_RESOURCE(_ONCE) ioctl + * set by``FW_CDEV_IOC_(DE)ALLOCATE_ISO_RESOURCE(_ONCE)`` ioctl * @type: %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED * @handle: Reference by which an allocated resource can be deallocated -- cgit v1.2.3 From 1ca3cb46a992d83ebd093acc64241007b20c5033 Mon Sep 17 00:00:00 2001 From: Maheshwar Ajja Date: Mon, 16 Mar 2020 23:42:30 +0100 Subject: media: v4l2-ctrl: Add H264 profile and levels Add H264 profile "Contrained High" and H264 levels "5.2", "6.0", "6.1" and "6.2". Signed-off-by: Maheshwar Ajja Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-ctrls.c | 5 +++++ include/uapi/linux/v4l2-controls.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 93d33d1db4e8..0186ba85aac7 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -336,6 +336,10 @@ const char * const *v4l2_ctrl_get_menu(u32 id) "4.2", "5", "5.1", + "5.2", + "6.0", + "6.1", + "6.2", NULL, }; static const char * const h264_loop_filter[] = { @@ -362,6 +366,7 @@ const char * const *v4l2_ctrl_get_menu(u32 id) "Scalable High Intra", "Stereo High", "Multiview High", + "Constrained High", NULL, }; static const char * const vui_sar_idc[] = { diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 1a58d7cc4ccc..0ba1005c9651 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -473,6 +473,10 @@ enum v4l2_mpeg_video_h264_level { V4L2_MPEG_VIDEO_H264_LEVEL_4_2 = 13, V4L2_MPEG_VIDEO_H264_LEVEL_5_0 = 14, V4L2_MPEG_VIDEO_H264_LEVEL_5_1 = 15, + V4L2_MPEG_VIDEO_H264_LEVEL_5_2 = 16, + V4L2_MPEG_VIDEO_H264_LEVEL_6_0 = 17, + V4L2_MPEG_VIDEO_H264_LEVEL_6_1 = 18, + V4L2_MPEG_VIDEO_H264_LEVEL_6_2 = 19, }; #define V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_ALPHA (V4L2_CID_MPEG_BASE+360) #define V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_BETA (V4L2_CID_MPEG_BASE+361) @@ -501,6 +505,7 @@ enum v4l2_mpeg_video_h264_profile { V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA = 14, V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH = 15, V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH = 16, + V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH = 17, }; #define V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_HEIGHT (V4L2_CID_MPEG_BASE+364) #define V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_WIDTH (V4L2_CID_MPEG_BASE+365) -- cgit v1.2.3 From d8e25a10ef876bfb2e6ee611fbbb7f7c926a3309 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 22 Apr 2020 12:07:10 +0100 Subject: ASoC: SOF: Fix build The recent batch of SOF changes failed to build on some x86 configurations including an allmodconfig, revert the commits: e150ef4169a76 ASoC: SOF: Introduce extended manifest 371091417864b ASoC: SOF: ext_manifest: parse firmware version 7c024b948c206 ASoC: SOF: ext_manifest: parse compiler version 9e72f13ee541c ASoC: SOF: ext_manifest: parse windows to fix this. Reported-by: Stephen Rothwell Signed-off-by: Mark Brown --- include/uapi/sound/sof/ext_manifest.h | 91 ---------------- sound/soc/sof/loader.c | 189 +--------------------------------- 2 files changed, 2 insertions(+), 278 deletions(-) delete mode 100644 include/uapi/sound/sof/ext_manifest.h (limited to 'include/uapi') diff --git a/include/uapi/sound/sof/ext_manifest.h b/include/uapi/sound/sof/ext_manifest.h deleted file mode 100644 index d49c47d08c7f..000000000000 --- a/include/uapi/sound/sof/ext_manifest.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ -/* - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * Copyright(c) 2020 Intel Corporation. All rights reserved. - */ - -/* - * Extended manifest is a place to store metadata about firmware, known during - * compilation time - for example firmware version or used compiler. - * Given information are read on host side before firmware startup. - * This part of output binary is not signed. - */ - -#ifndef __SOF_FIRMWARE_EXT_MANIFEST_H__ -#define __SOF_FIRMWARE_EXT_MANIFEST_H__ - -#include -#include - -/* In ASCII `XMan` */ -#define SOF_EXT_MAN_MAGIC_NUMBER 0x6e614d58 - -/* Build u32 number in format MMmmmppp */ -#define SOF_EXT_MAN_BUILD_VERSION(MAJOR, MINOR, PATH) ((uint32_t)( \ - ((MAJOR) << 24) | \ - ((MINOR) << 12) | \ - (PATH))) - -/* check extended manifest version consistency */ -#define SOF_EXT_MAN_VERSION_INCOMPATIBLE(host_ver, cli_ver) ( \ - ((host_ver) & GENMASK(31, 24)) != \ - ((cli_ver) & GENMASK(31, 24))) - -/* used extended manifest header version */ -#define SOF_EXT_MAN_VERSION SOF_EXT_MAN_BUILD_VERSION(1, 0, 0) - -/* extended manifest header, deleting any field breaks backward compatibility */ -struct sof_ext_man_header { - uint32_t magic; /*< identification number, */ - /*< EXT_MAN_MAGIC_NUMBER */ - uint32_t full_size; /*< [bytes] full size of ext_man, */ - /*< (header + content + padding) */ - uint32_t header_size; /*< [bytes] makes header extensionable, */ - /*< after append new field to ext_man header */ - /*< then backward compatible won't be lost */ - uint32_t header_version; /*< value of EXT_MAN_VERSION */ - /*< not related with following content */ - uint8_t elements[]; /*< list of ext_man_elem_* elements */ -} __packed; - -/* Now define extended manifest elements */ - -/* Extended manifest elements types */ -enum sof_ext_man_elem_type { - SOF_EXT_MAN_ELEM_FW_VERSION = 0, - SOF_EXT_MAN_ELEM_WINDOW = SOF_IPC_EXT_WINDOW, - SOF_EXT_MAN_ELEM_CC_VERSION = SOF_IPC_EXT_CC_INFO, -}; - -/* extended manifest element header */ -struct sof_ext_man_elem_header { - uint32_t type; /*< SOF_EXT_MAN_ELEM_ */ - uint32_t size; /*< in bytes, including header size */ - uint8_t blob[]; /*< type dependent content */ -} __packed; - -/* FW version */ -struct sof_ext_man_fw_version { - struct sof_ext_man_elem_header hdr; - /* use sof_ipc struct because of code re-use */ - struct sof_ipc_fw_version version; - uint32_t flags; -} __packed; - -/* extended data memory windows for IPC, trace and debug */ -struct sof_ext_man_window { - struct sof_ext_man_elem_header hdr; - /* use sof_ipc struct because of code re-use */ - struct sof_ipc_window ipc_window; -} __packed; - -/* Used C compiler description */ -struct sof_ext_man_cc_version { - struct sof_ext_man_elem_header hdr; - /* use sof_ipc struct because of code re-use */ - struct sof_ipc_cc_version cc_version; -} __packed; - -#endif /* __SOF_FIRMWARE_EXT_MANIFEST_H__ */ diff --git a/sound/soc/sof/loader.c b/sound/soc/sof/loader.c index 8be30cd5e038..89f35db2577d 100644 --- a/sound/soc/sof/loader.c +++ b/sound/soc/sof/loader.c @@ -12,7 +12,6 @@ #include #include -#include #include "ops.h" static int get_ext_windows(struct snd_sof_dev *sdev, @@ -20,21 +19,13 @@ static int get_ext_windows(struct snd_sof_dev *sdev, { const struct sof_ipc_window *w = container_of(ext_hdr, struct sof_ipc_window, ext_hdr); - size_t w_size = struct_size(w, window, w->num_windows); - - if (sdev->info_window) { - if (memcmp(sdev->info_window, w, w_size)) { - dev_err(sdev->dev, "error: mistmatch between window descriptor from extended manifest and mailbox"); - return -EINVAL; - } - return 0; - } if (w->num_windows == 0 || w->num_windows > SOF_IPC_MAX_ELEMS) return -EINVAL; /* keep a local copy of the data */ - sdev->info_window = kmemdup(w, w_size, GFP_KERNEL); + sdev->info_window = kmemdup(w, struct_size(w, window, w->num_windows), + GFP_KERNEL); if (!sdev->info_window) return -ENOMEM; @@ -49,14 +40,6 @@ static int get_cc_info(struct snd_sof_dev *sdev, const struct sof_ipc_cc_version *cc = container_of(ext_hdr, struct sof_ipc_cc_version, ext_hdr); - if (sdev->cc_version) { - if (memcmp(sdev->cc_version, cc, cc->ext_hdr.hdr.size)) { - dev_err(sdev->dev, "error: receive diverged cc_version descriptions"); - return -EINVAL; - } - return 0; - } - dev_dbg(sdev->dev, "Firmware info: used compiler %s %d:%d:%d%s used optimization flags %s\n", cc->name, cc->major, cc->minor, cc->micro, cc->desc, cc->optim); @@ -143,151 +126,6 @@ int snd_sof_fw_parse_ext_data(struct snd_sof_dev *sdev, u32 bar, u32 offset) } EXPORT_SYMBOL(snd_sof_fw_parse_ext_data); -static int ext_man_get_fw_version(struct snd_sof_dev *sdev, - const struct sof_ext_man_elem_header *hdr) -{ - const struct sof_ext_man_fw_version *v; - - v = container_of(hdr, struct sof_ext_man_fw_version, hdr); - - memcpy(&sdev->fw_ready.version, &v->version, sizeof(v->version)); - sdev->fw_ready.flags = v->flags; - - /* log ABI versions and check FW compatibility */ - return snd_sof_ipc_valid(sdev); -} - -static int ext_man_get_windows(struct snd_sof_dev *sdev, - const struct sof_ext_man_elem_header *hdr) -{ - const struct sof_ipc_ext_data_hdr *w_ipc; - const struct sof_ext_man_window *w; - - w = container_of(hdr, struct sof_ext_man_window, hdr); - w_ipc = (const struct sof_ipc_ext_data_hdr *)&w->ipc_window; - - return get_ext_windows(sdev, w_ipc); -} - -static int ext_man_get_cc_info(struct snd_sof_dev *sdev, - const struct sof_ext_man_elem_header *hdr) -{ - const struct sof_ext_man_cc_version *cc; - const struct sof_ipc_ext_data_hdr *cc_version; - - cc = container_of(hdr, struct sof_ext_man_cc_version, hdr); - cc_version = (const struct sof_ipc_ext_data_hdr *)&cc->cc_version; - - return get_cc_info(sdev, cc_version); -} - -static ssize_t snd_sof_ext_man_size(const struct firmware *fw) -{ - const struct sof_ext_man_header *head = (void *)fw->data; - - /* - * assert fw size is big enough to contain extended manifest header, - * it prevents from reading unallocated memory from `head` in following - * step. - */ - if (fw->size < sizeof(*head)) - return -EINVAL; - - /* - * When fw points to extended manifest, - * then first u32 must be equal SOF_EXT_MAN_MAGIC_NUMBER. - */ - if (head->magic == SOF_EXT_MAN_MAGIC_NUMBER) - return head->full_size; - - /* otherwise given fw don't have an extended manifest */ - return 0; -} - -/* parse extended FW manifest data structures */ -static int snd_sof_fw_ext_man_parse(struct snd_sof_dev *sdev, - const struct firmware *fw) -{ - const struct sof_ext_man_elem_header *elem_hdr; - const struct sof_ext_man_header *head; - ssize_t ext_man_size; - ssize_t remaining; - uintptr_t iptr; - int ret = 0; - - head = (struct sof_ext_man_header *)fw->data; - remaining = head->full_size - head->header_size; - ext_man_size = snd_sof_ext_man_size(fw); - - /* Assert firmware starts with extended manifest */ - if (ext_man_size < 0) { - dev_err(sdev->dev, "error: exception while reading firmware extended manifest, code %d\n", - (int)ext_man_size); - return ext_man_size; - } else if (!ext_man_size) { - dev_err(sdev->dev, "error: can't parse extended manifest when it's not present\n"); - return -EINVAL; - } - - /* incompatible version */ - if (SOF_EXT_MAN_VERSION_INCOMPATIBLE(SOF_EXT_MAN_VERSION, - head->header_version)) { - dev_err(sdev->dev, "error: extended manifest version 0x%X differ from used 0x%X\n", - head->header_version, SOF_EXT_MAN_VERSION); - return -EINVAL; - } - - /* get first extended manifest element header */ - iptr = (uintptr_t)fw->data + head->header_size; - - while (remaining > sizeof(*elem_hdr)) { - elem_hdr = (struct sof_ext_man_elem_header *)iptr; - - dev_dbg(sdev->dev, "found sof_ext_man header type %d size 0x%X\n", - elem_hdr->type, elem_hdr->size); - - if (elem_hdr->size < sizeof(*elem_hdr) || - elem_hdr->size > remaining) { - dev_err(sdev->dev, "error: invalid sof_ext_man header size, type %d size 0x%X\n", - elem_hdr->type, elem_hdr->size); - break; - } - - /* process structure data */ - switch (elem_hdr->type) { - case SOF_EXT_MAN_ELEM_FW_VERSION: - ret = ext_man_get_fw_version(sdev, elem_hdr); - break; - case SOF_EXT_MAN_ELEM_WINDOW: - ret = ext_man_get_windows(sdev, elem_hdr); - break; - case SOF_EXT_MAN_ELEM_CC_VERSION: - ret = ext_man_get_cc_info(sdev, elem_hdr); - break; - default: - dev_warn(sdev->dev, "warning: unknown sof_ext_man header type %d size 0x%X\n", - elem_hdr->type, elem_hdr->size); - break; - } - - if (ret < 0) { - dev_err(sdev->dev, "error: failed to parse sof_ext_man header type %d size 0x%X\n", - elem_hdr->type, elem_hdr->size); - break; - } - - remaining -= elem_hdr->size; - iptr += elem_hdr->size; - } - - if (remaining) { - dev_err(sdev->dev, "error: sof_ext_man header is inconsistent\n"); - ret = -EINVAL; - } - - return ret; -} - /* * IPC Firmware ready. */ @@ -635,7 +473,6 @@ int snd_sof_load_firmware_raw(struct snd_sof_dev *sdev) { struct snd_sof_pdata *plat_data = sdev->pdata; const char *fw_filename; - ssize_t ext_man_size; int ret; /* Don't request firmware again if firmware is already requested */ @@ -653,33 +490,11 @@ int snd_sof_load_firmware_raw(struct snd_sof_dev *sdev) if (ret < 0) { dev_err(sdev->dev, "error: request firmware %s failed err: %d\n", fw_filename, ret); - goto err; } else { dev_dbg(sdev->dev, "request_firmware %s successful\n", fw_filename); } - /* check for extended manifest */ - ext_man_size = snd_sof_ext_man_size(plat_data->fw); - if (ext_man_size > 0) { - ret = snd_sof_fw_ext_man_parse(sdev, plat_data->fw); - - /* when no error occurred, drop extended manifest */ - if (!ret) - plat_data->fw_offset = ext_man_size; - else - dev_err(sdev->dev, "error: firmware %s contains unsupported or invalid extended manifest: %d\n", - fw_filename, ret); - } else if (!ext_man_size) { - /* No extended manifest, so nothing to skip during FW load */ - dev_dbg(sdev->dev, "firmware doesn't contain extended manifest\n"); - } else { - ret = ext_man_size; - dev_err(sdev->dev, "error: firmware %s contains unsupported or invalid extended manifest: %d\n", - fw_filename, ret); - } - -err: kfree(fw_filename); return ret; -- cgit v1.2.3 From 4d797fce783a8eb11dd23463828db84743795046 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 1 Apr 2020 17:25:47 +0300 Subject: cfg80211: Unprotected Beacon frame RX indication Extend cfg80211_rx_unprot_mlme_mgmt() to cover indication of unprotected Beacon frames in addition to the previously used Deauthentication and Disassociation frames. The Beacon frame case is quite similar, but has couple of exceptions: this is used both with fully unprotected and also incorrectly protected frames and there is a rate limit on the events to avoid unnecessary flooding netlink events in case something goes wrong. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200401142548.6990-1-jouni@codeaurora.org [add missing kernel-doc] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++-- include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/nl80211.c | 13 +++++++++++-- net/wireless/sme.c | 2 ++ 4 files changed, 30 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70e48f66dac8..775952677b3d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5045,6 +5045,8 @@ struct cfg80211_cqm_config; * @pmsr_list: (private) peer measurement requests * @pmsr_lock: (private) peer measurements requests/results lock * @pmsr_free_wk: (private) peer measurements cleanup work + * @unprot_beacon_reported: (private) timestamp of last + * unprotected beacon report */ struct wireless_dev { struct wiphy *wiphy; @@ -5121,6 +5123,8 @@ struct wireless_dev { struct list_head pmsr_list; spinlock_t pmsr_lock; struct work_struct pmsr_free_wk; + + unsigned long unprot_beacon_reported; }; static inline u8 *wdev_address(struct wireless_dev *wdev) @@ -6135,12 +6139,16 @@ void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); /** * cfg80211_rx_unprot_mlme_mgmt - notification of unprotected mlme mgmt frame * @dev: network device - * @buf: deauthentication frame (header + body) + * @buf: received management frame (header + body) * @len: length of the frame data * * This function is called whenever a received deauthentication or dissassoc * frame has been dropped in station mode because of MFP being used but the - * frame was not protected. This function may sleep. + * frame was not protected. This is also used to notify reception of a Beacon + * frame that was dropped because it did not include a valid MME MIC while + * beacon protection was enabled (BIGTK configured in station mode). + * + * This function may sleep. */ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2b691161830f..afdd9802ccb8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1151,6 +1151,11 @@ * @NL80211_CMD_SET_TID_CONFIG: Data frame TID specific configuration * is passed using %NL80211_ATTR_TID_CONFIG attribute. * + * @NL80211_CMD_UNPROT_BEACON: Unprotected or incorrectly protected Beacon + * frame. This event is used to indicate that a received Beacon frame was + * dropped because it did not include a valid MME MIC while beacon + * protection was enabled (BIGTK configured in station mode). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1377,6 +1382,8 @@ enum nl80211_commands { NL80211_CMD_SET_TID_CONFIG, + NL80211_CMD_UNPROT_BEACON, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 692bcd35f809..2127e5344b1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -15542,10 +15542,19 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, if (WARN_ON(len < 2)) return; - if (ieee80211_is_deauth(mgmt->frame_control)) + if (ieee80211_is_deauth(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE; - else + } else if (ieee80211_is_disassoc(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DISASSOCIATE; + } else if (ieee80211_is_beacon(mgmt->frame_control)) { + if (wdev->unprot_beacon_reported && + elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000) + return; + cmd = NL80211_CMD_UNPROT_BEACON; + wdev->unprot_beacon_reported = jiffies; + } else { + return; + } trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ac3e60aa1fc8..3554c0d951f4 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -694,6 +694,7 @@ void __cfg80211_connect_result(struct net_device *dev, return; } + wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); @@ -921,6 +922,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, cfg80211_hold_bss(bss_from_pub(info->bss)); wdev->current_bss = bss_from_pub(info->bss); + wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); -- cgit v1.2.3 From 9dba48a6ece79da064655736dc7347a5fcadedef Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 17 Apr 2020 12:40:15 +0200 Subject: cfg80211: support multicast RX registration For DPP, there's a need to receive multicast action frames, but many drivers need a special filter configuration for this. Support announcing from userspace in the management registration that multicast RX is required, with an extended feature flag if the driver handles this. Signed-off-by: Johannes Berg Reviewed-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200417124013.c46238801048.Ib041d437ce0bff28a0c6d5dc915f68f1d8591002@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 13 +++++++++++++ net/wireless/core.h | 3 ++- net/wireless/mlme.c | 38 ++++++++++++++++++++++++++++++-------- net/wireless/nl80211.c | 10 ++++++++++ 5 files changed, 59 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bc273f6d60f2..dbb9675fe38f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3390,9 +3390,13 @@ struct cfg80211_update_owe_info { * for the entire device * @interface_stypes: bitmap of management frame subtypes registered * for the given interface + * @global_mcast_rx: mcast RX is needed globally for these subtypes + * @interface_mcast_stypes: mcast RX is needed on this interface + * for these subtypes */ struct mgmt_frame_regs { u32 global_stypes, interface_stypes; + u32 global_mcast_stypes, interface_mcast_stypes; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index afdd9802ccb8..e0dc89eceab8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -687,6 +687,10 @@ * four bytes for vendor frames including the OUI. The registration * cannot be dropped, but is removed automatically when the netlink * socket is closed. Multiple registrations can be made. + * The %NL80211_ATTR_RECEIVE_MULTICAST flag attribute can be given if + * %NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS is available, in which + * case the registration can also be modified to include/exclude the + * flag, rather than requiring unregistration to change it. * @NL80211_CMD_REGISTER_ACTION: Alias for @NL80211_CMD_REGISTER_FRAME for * backward compatibility * @NL80211_CMD_FRAME: Management frame TX request and RX notification. This @@ -2477,6 +2481,9 @@ enum nl80211_commands { * no roaming occurs between the reauth threshold and PMK expiration, * disassociation is still forced. * + * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the + * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2952,6 +2959,8 @@ enum nl80211_attrs { NL80211_ATTR_PMK_LIFETIME, NL80211_ATTR_PMK_REAUTH_THRESHOLD, + NL80211_ATTR_RECEIVE_MULTICAST, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5691,6 +5700,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_DEL_IBSS_STA: The driver supports removing stations * in IBSS mode, essentially by dropping their state. * + * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations + * are possible for multicast frames and those will be reported properly. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5742,6 +5754,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH, NL80211_EXT_FEATURE_PROTECTED_TWT, NL80211_EXT_FEATURE_DEL_IBSS_STA, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/core.h b/net/wireless/core.h index 30fb2c35ae43..639d41896573 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -381,7 +381,8 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack); + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack); void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 2e1a21e90b83..409497a3527d 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -426,6 +426,8 @@ struct cfg80211_mgmt_registration { __le16 frame_type; + bool multicast_rx; + u8 match[]; }; @@ -442,10 +444,18 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); + u32 mcast_mask = 0; + + if (reg->multicast_rx) + mcast_mask = mask; upd.global_stypes |= mask; - if (tmp == wdev) + upd.global_mcast_stypes |= mcast_mask; + + if (tmp == wdev) { upd.interface_stypes |= mask; + upd.interface_mcast_stypes |= mcast_mask; + } } } rcu_read_unlock(); @@ -465,11 +475,13 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack) + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack) { struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; + bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; @@ -520,6 +532,11 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, continue; if (memcmp(reg->match, match_data, mlen) == 0) { + if (reg->multicast_rx != multicast_rx) { + update_multicast = true; + reg->multicast_rx = multicast_rx; + break; + } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; @@ -529,12 +546,17 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, if (err) goto out; - memcpy(nreg->match, match_data, match_len); - nreg->match_len = match_len; - nreg->nlportid = snd_portid; - nreg->frame_type = cpu_to_le16(frame_type); - nreg->wdev = wdev; - list_add(&nreg->list, &wdev->mgmt_registrations); + if (update_multicast) { + kfree(nreg); + } else { + memcpy(nreg->match, match_data, match_len); + nreg->match_len = match_len; + nreg->nlportid = snd_portid; + nreg->frame_type = cpu_to_le16(frame_type); + nreg->wdev = wdev; + nreg->multicast_rx = multicast_rx; + list_add(&nreg->list, &wdev->mgmt_registrations); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2127e5344b1a..73a3e885d4dd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -661,6 +661,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_NO_PREAUTH] = { .type = NLA_FLAG }, [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), + [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -10773,9 +10774,18 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_RECEIVE_MULTICAST] && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS)) { + GENL_SET_ERR_MSG(info, + "multicast RX registrations are not supported"); + return -EOPNOTSUPP; + } + return cfg80211_mlme_register_mgmt(wdev, info->snd_portid, frame_type, nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]), + info->attrs[NL80211_ATTR_RECEIVE_MULTICAST], info->extack); } -- cgit v1.2.3 From 155d7c733807190258639c66b36340948f369349 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 20 Apr 2020 14:06:00 +0200 Subject: nl80211: allow client-only BIGTK support The current NL80211_EXT_FEATURE_BEACON_PROTECTION feature flag requires both AP and client support, add a new one called NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT that enables only support in client (and P2P-client) modes. Link: https://lore.kernel.org/r/20200420140559.6ba704053a5a.Ifeb869fb0b48e52fe0cb9c15572b93ac8a924f8d@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e0dc89eceab8..9679d561f7d0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5690,6 +5690,8 @@ enum nl80211_feature_flags { * * @NL80211_EXT_FEATURE_BEACON_PROTECTION: The driver supports Beacon protection * and can receive key configuration for BIGTK using key indexes 6 and 7. + * @NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT: The driver supports Beacon + * protection as a client only and cannot transmit protected beacons. * * @NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH: The driver can disable the * forwarding of preauth frames over the control port. They are then @@ -5755,6 +5757,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_PROTECTED_TWT, NL80211_EXT_FEATURE_DEL_IBSS_STA, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 73a3e885d4dd..d470d77d2eb6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3905,14 +3905,25 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) }; void *hdr; struct sk_buff *msg; + bool bigtk_support = false; + + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION)) + bigtk_support = true; + + if ((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_STATION || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_CLIENT) && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) + bigtk_support = true; if (info->attrs[NL80211_ATTR_KEY_IDX]) { key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 5 && - !wiphy_ext_feature_isset( - &rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + + if (key_idx >= 6 && key_idx <= 7 && !bigtk_support) { + GENL_SET_ERR_MSG(info, "BIGTK not supported"); return -EINVAL; + } } if (info->attrs[NL80211_ATTR_MAC]) -- cgit v1.2.3 From acd05785e48c01edb2c4f4d014d28478b5f19fb5 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 17 Apr 2020 15:14:46 -0700 Subject: kvm: add capability for halt polling KVM_CAP_HALT_POLL is a per-VM capability that lets userspace control the halt-polling time, allowing halt-polling to be tuned or disabled on particular VMs. With dynamic halt-polling, a VM's VCPUs can poll from anywhere from [0, halt_poll_ns] on each halt. KVM_CAP_HALT_POLL sets the upper limit on the poll time. Signed-off-by: David Matlack Signed-off-by: Jon Cargille Reviewed-by: Jim Mattson Message-Id: <20200417221446.108733-1-jcargill@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 17 +++++++++++++++++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 19 +++++++++++++++---- 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index efbbe570aa9b..d871dacb984e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5802,6 +5802,23 @@ If present, this capability can be enabled for a VM, meaning that KVM will allow the transition to secure guest mode. Otherwise KVM will veto the transition. +7.20 KVM_CAP_HALT_POLL +---------------------- + +:Architectures: all +:Target: VM +:Parameters: args[0] is the maximum poll time in nanoseconds +:Returns: 0 on success; -1 on error + +This capability overrides the kvm module parameter halt_poll_ns for the +target VM. + +VCPU polling allows a VCPU to poll for wakeup events instead of immediately +scheduling during guest halts. The maximum time a VCPU can spend polling is +controlled by the kvm module parameter halt_poll_ns. This capability allows +the maximum halt time to specified on a per-VM basis, effectively overriding +the module parameter for the target VM. + 8. Other capabilities. ====================== diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5285a5568208..3cc6ccbb1183 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -503,6 +503,7 @@ struct kvm { struct srcu_struct srcu; struct srcu_struct irq_srcu; pid_t userspace_pid; + unsigned int max_halt_poll_ns; }; #define kvm_err(fmt, ...) \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 428c7dde6b4b..ac9eba0289d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1017,6 +1017,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_VCPU_RESETS 179 #define KVM_CAP_S390_PROTECTED 180 #define KVM_CAP_PPC_SECURE_GUEST 181 +#define KVM_CAP_HALT_POLL 182 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e7436d054305..33e1eee96f75 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -710,6 +710,8 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err_no_arch_destroy_vm; } + kvm->max_halt_poll_ns = halt_poll_ns; + r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_arch_destroy_vm; @@ -2713,15 +2715,16 @@ out: if (!kvm_arch_no_poll(vcpu)) { if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); - } else if (halt_poll_ns) { + } else if (vcpu->kvm->max_halt_poll_ns) { if (block_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + else if (vcpu->halt_poll_ns && + block_ns > vcpu->kvm->max_halt_poll_ns) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ - else if (vcpu->halt_poll_ns < halt_poll_ns && - block_ns < halt_poll_ns) + else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && + block_ns < vcpu->kvm->max_halt_poll_ns) grow_halt_poll_ns(vcpu); } else { vcpu->halt_poll_ns = 0; @@ -3510,6 +3513,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: + case KVM_CAP_HALT_POLL: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3560,6 +3564,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, return 0; } #endif + case KVM_CAP_HALT_POLL: { + if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) + return -EINVAL; + + kvm->max_halt_poll_ns = cap->args[0]; + return 0; + } default: return kvm_vm_ioctl_enable_cap(kvm, cap); } -- cgit v1.2.3 From 71d19214776e61b33da48f7c1b46e522c7f78221 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sun, 26 Apr 2020 09:15:25 -0700 Subject: bpf: add bpf_ktime_get_boot_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a device like a cellphone which is constantly suspending and resuming CLOCK_MONOTONIC is not particularly useful for keeping track of or reacting to external network events. Instead you want to use CLOCK_BOOTTIME. Hence add bpf_ktime_get_boot_ns() as a mirror of bpf_ktime_get_ns() based around CLOCK_BOOTTIME instead of CLOCK_MONOTONIC. Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- drivers/media/rc/bpf-lirc.c | 2 ++ include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 13 ++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 13 ++++++++++++- 7 files changed, 44 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 0f3417d161b8..069c42f22a8c 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -103,6 +103,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_prandom_u32: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5147e11e53ff..10960cfabea4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1509,6 +1509,7 @@ extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bbf1b65be10..4a6c47f3febe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -652,6 +652,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -3025,6 +3027,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3161,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0cc91805069a..6aa11de67315 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9a6b23387d02..5c0290e0696e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -155,6 +155,18 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_boot_ns) +{ + /* NMI safe access to clock boottime */ + return ktime_get_boot_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { + .func = bpf_ktime_get_boot_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -615,6 +627,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; default: break; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..e875c95d3ced 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,6 +797,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7bbf1b65be10..4a6c47f3febe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -652,6 +652,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -3025,6 +3027,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3161,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 74f99482eae03195ced512b440b31d62bdb6e943 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Tue, 21 Apr 2020 10:04:16 -0500 Subject: netfilter: nf_conntrack: add IPS_HW_OFFLOAD status bit This bit indicates that the conntrack entry is offloaded to hardware flow table. nf_conntrack entry will be tagged with [HW_OFFLOAD] if it's offload to hardware. cat /proc/net/nf_conntrack ipv4 2 tcp 6 \ src=1.1.1.17 dst=1.1.1.16 sport=56394 dport=5001 \ src=1.1.1.16 dst=1.1.1.17 sport=5001 dport=56394 [HW_OFFLOAD] \ mark=0 zone=0 use=3 Note that HW_OFFLOAD/OFFLOAD/ASSURED are mutually exclusive. Changelog: * V1->V2: - Remove check of lastused from stats. It was meant for cases such as removing driver module while traffic still running. Better to handle such cases from garbage collector. Signed-off-by: Bodong Wang Reviewed-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 8 ++++++-- net/netfilter/nf_conntrack_standalone.c | 4 +++- net/netfilter/nf_flow_table_offload.c | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index b6f0bb1dc799..4b3395082d15 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -114,15 +114,19 @@ enum ip_conntrack_status { IPS_OFFLOAD_BIT = 14, IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT), + /* Conntrack has been offloaded to hardware. */ + IPS_HW_OFFLOAD_BIT = 15, + IPS_HW_OFFLOAD = (1 << IPS_HW_OFFLOAD_BIT), + /* Be careful here, modifying these bits can make things messy, * so don't let users modify them directly. */ IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED | - IPS_OFFLOAD), + IPS_OFFLOAD | IPS_HW_OFFLOAD), - __IPS_MAX_BIT = 15, + __IPS_MAX_BIT = 16, }; /* Connection tracking event types */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..5a3e6c43ee68 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -348,7 +348,9 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) + seq_puts(s, "[HW_OFFLOAD] "); + else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e3b099c14eff..a2abb0feab7f 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -754,12 +754,15 @@ static void flow_offload_work_add(struct flow_offload_work *offload) err = flow_offload_rule_add(offload, flow_rule); if (err < 0) set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); + else + set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); nf_flow_offload_destroy(flow_rule); } static void flow_offload_work_del(struct flow_offload_work *offload) { + clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); -- cgit v1.2.3 From 1b450791d517d4d6666ab9ab6d9a20c8819e3572 Mon Sep 17 00:00:00 2001 From: Mateusz Gorski Date: Mon, 27 Apr 2020 15:27:26 +0200 Subject: ASoC: Intel: Multiple I/O PCM format support for pipe For pipes supporting multiple input/output formats, kcontrol is created and selection of pipe input and output configuration is done based on control set. If more than one configuration is supported, then this patch allows user to select configuration of choice using amixer settings. Signed-off-by: Mateusz Gorski Signed-off-by: Pavan K S Reviewed-by: Cezary Rojewski Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20200427132727.24942-3-mateusz.gorski@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/skl-tplg-interface.h | 1 + sound/soc/intel/skylake/skl-topology.c | 95 +++++++++++++++++++++++++++++++++ sound/soc/intel/skylake/skl-topology.h | 1 + 3 files changed, 97 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h index 9eee32f5e407..f2711186c81f 100644 --- a/include/uapi/sound/skl-tplg-interface.h +++ b/include/uapi/sound/skl-tplg-interface.h @@ -18,6 +18,7 @@ */ #define SKL_CONTROL_TYPE_BYTE_TLV 0x100 #define SKL_CONTROL_TYPE_MIC_SELECT 0x102 +#define SKL_CONTROL_TYPE_MULTI_IO_SELECT 0x103 #define HDA_SST_CFG_MAX 900 /* size of copier cfg*/ #define MAX_IN_QUEUE 8 diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 8198d5c5a590..f30c531d21bc 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -579,6 +579,38 @@ static int skl_tplg_unload_pipe_modules(struct skl_dev *skl, return ret; } +static bool skl_tplg_is_multi_fmt(struct skl_dev *skl, struct skl_pipe *pipe) +{ + struct skl_pipe_fmt *cur_fmt; + struct skl_pipe_fmt *next_fmt; + int i; + + if (pipe->nr_cfgs <= 1) + return false; + + if (pipe->conn_type != SKL_PIPE_CONN_TYPE_FE) + return true; + + for (i = 0; i < pipe->nr_cfgs - 1; i++) { + if (pipe->direction == SNDRV_PCM_STREAM_PLAYBACK) { + cur_fmt = &pipe->configs[i].out_fmt; + next_fmt = &pipe->configs[i + 1].out_fmt; + } else { + cur_fmt = &pipe->configs[i].in_fmt; + next_fmt = &pipe->configs[i + 1].in_fmt; + } + + if (!CHECK_HW_PARAMS(cur_fmt->channels, cur_fmt->freq, + cur_fmt->bps, + next_fmt->channels, + next_fmt->freq, + next_fmt->bps)) + return true; + } + + return false; +} + /* * Here, we select pipe format based on the pipe type and pipe * direction to determine the current config index for the pipeline. @@ -601,6 +633,14 @@ skl_tplg_get_pipe_config(struct skl_dev *skl, struct skl_module_cfg *mconfig) return 0; } + if (skl_tplg_is_multi_fmt(skl, pipe)) { + pipe->cur_config_idx = pipe->pipe_config_idx; + pipe->memory_pages = pconfig->mem_pages; + dev_dbg(skl->dev, "found pipe config idx:%d\n", + pipe->cur_config_idx); + return 0; + } + if (pipe->conn_type == SKL_PIPE_CONN_TYPE_NONE) { dev_dbg(skl->dev, "No conn_type detected, take 0th config\n"); pipe->cur_config_idx = 0; @@ -1315,6 +1355,56 @@ static int skl_tplg_pga_event(struct snd_soc_dapm_widget *w, return 0; } +static int skl_tplg_multi_config_set_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol, + bool is_set) +{ + struct snd_soc_component *component = + snd_soc_kcontrol_component(kcontrol); + struct hdac_bus *bus = snd_soc_component_get_drvdata(component); + struct skl_dev *skl = bus_to_skl(bus); + struct skl_pipeline *ppl; + struct skl_pipe *pipe = NULL; + struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; + u32 *pipe_id; + + if (!ec) + return -EINVAL; + + if (is_set && ucontrol->value.enumerated.item[0] > ec->items) + return -EINVAL; + + pipe_id = ec->dobj.private; + + list_for_each_entry(ppl, &skl->ppl_list, node) { + if (ppl->pipe->ppl_id == *pipe_id) { + pipe = ppl->pipe; + break; + } + } + if (!pipe) + return -EIO; + + if (is_set) + pipe->pipe_config_idx = ucontrol->value.enumerated.item[0]; + else + ucontrol->value.enumerated.item[0] = pipe->pipe_config_idx; + + return 0; +} + +static int skl_tplg_multi_config_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return skl_tplg_multi_config_set_get(kcontrol, ucontrol, false); +} + +static int skl_tplg_multi_config_set(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return skl_tplg_multi_config_set_get(kcontrol, ucontrol, true); +} + static int skl_tplg_tlv_control_get(struct snd_kcontrol *kcontrol, unsigned int __user *data, unsigned int size) { @@ -1854,6 +1944,11 @@ static const struct snd_soc_tplg_kcontrol_ops skl_tplg_kcontrol_ops[] = { .get = skl_tplg_mic_control_get, .put = skl_tplg_mic_control_set, }, + { + .id = SKL_CONTROL_TYPE_MULTI_IO_SELECT, + .get = skl_tplg_multi_config_get, + .put = skl_tplg_multi_config_set, + }, }; static int skl_tplg_fill_pipe_cfg(struct device *dev, diff --git a/sound/soc/intel/skylake/skl-topology.h b/sound/soc/intel/skylake/skl-topology.h index e967800dbb62..06576147cc29 100644 --- a/sound/soc/intel/skylake/skl-topology.h +++ b/sound/soc/intel/skylake/skl-topology.h @@ -306,6 +306,7 @@ struct skl_pipe { struct skl_path_config configs[SKL_MAX_PATH_CONFIGS]; struct list_head w_list; bool passthru; + u32 pipe_config_idx; }; enum skl_module_state { -- cgit v1.2.3 From 2d744ecf2b98405723a2138a547e5c75009bc4e5 Mon Sep 17 00:00:00 2001 From: Mateusz Gorski Date: Mon, 27 Apr 2020 15:27:27 +0200 Subject: ASoC: Intel: Skylake: Automatic DMIC format configuration according to information from NHLT Automatically choose DMIC pipeline format configuration depending on information included in NHLT. Change the access rights of appropriate kcontrols to read-only in order to prevent user interference. Signed-off-by: Mateusz Gorski Reviewed-by: Cezary Rojewski Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20200427132727.24942-4-mateusz.gorski@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/skl-tplg-interface.h | 1 + sound/soc/intel/skylake/skl-topology.c | 64 +++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h index f2711186c81f..a93c0decfdd5 100644 --- a/include/uapi/sound/skl-tplg-interface.h +++ b/include/uapi/sound/skl-tplg-interface.h @@ -19,6 +19,7 @@ #define SKL_CONTROL_TYPE_BYTE_TLV 0x100 #define SKL_CONTROL_TYPE_MIC_SELECT 0x102 #define SKL_CONTROL_TYPE_MULTI_IO_SELECT 0x103 +#define SKL_CONTROL_TYPE_MULTI_IO_SELECT_DMIC 0x104 #define HDA_SST_CFG_MAX 900 /* size of copier cfg*/ #define MAX_IN_QUEUE 8 diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index f30c531d21bc..b9aab47d1202 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -1405,6 +1405,18 @@ static int skl_tplg_multi_config_set(struct snd_kcontrol *kcontrol, return skl_tplg_multi_config_set_get(kcontrol, ucontrol, true); } +static int skl_tplg_multi_config_get_dmic(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return skl_tplg_multi_config_set_get(kcontrol, ucontrol, false); +} + +static int skl_tplg_multi_config_set_dmic(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return skl_tplg_multi_config_set_get(kcontrol, ucontrol, true); +} + static int skl_tplg_tlv_control_get(struct snd_kcontrol *kcontrol, unsigned int __user *data, unsigned int size) { @@ -1949,6 +1961,11 @@ static const struct snd_soc_tplg_kcontrol_ops skl_tplg_kcontrol_ops[] = { .get = skl_tplg_multi_config_get, .put = skl_tplg_multi_config_set, }, + { + .id = SKL_CONTROL_TYPE_MULTI_IO_SELECT_DMIC, + .get = skl_tplg_multi_config_get_dmic, + .put = skl_tplg_multi_config_set_dmic, + } }; static int skl_tplg_fill_pipe_cfg(struct device *dev, @@ -3109,12 +3126,21 @@ static int skl_tplg_control_load(struct snd_soc_component *cmpnt, case SND_SOC_TPLG_CTL_ENUM: tplg_ec = container_of(hdr, struct snd_soc_tplg_enum_control, hdr); - if (kctl->access & SNDRV_CTL_ELEM_ACCESS_READWRITE) { + if (kctl->access & SNDRV_CTL_ELEM_ACCESS_READ) { se = (struct soc_enum *)kctl->private_value; if (tplg_ec->priv.size) - return skl_init_enum_data(bus->dev, se, - tplg_ec); + skl_init_enum_data(bus->dev, se, tplg_ec); } + + /* + * now that the control initializations are done, remove + * write permission for the DMIC configuration enums to + * avoid conflicts between NHLT settings and user interaction + */ + + if (hdr->ops.get == SKL_CONTROL_TYPE_MULTI_IO_SELECT_DMIC) + kctl->access = SNDRV_CTL_ELEM_ACCESS_READ; + break; default: @@ -3584,6 +3610,37 @@ static int skl_manifest_load(struct snd_soc_component *cmpnt, int index, return 0; } +static void skl_tplg_complete(struct snd_soc_component *component) +{ + struct snd_soc_dobj *dobj; + struct snd_soc_acpi_mach *mach = + dev_get_platdata(component->card->dev); + int i; + + list_for_each_entry(dobj, &component->dobj_list, list) { + struct snd_kcontrol *kcontrol = dobj->control.kcontrol; + struct soc_enum *se = + (struct soc_enum *)kcontrol->private_value; + char **texts = dobj->control.dtexts; + char chan_text[4]; + + if (dobj->type != SND_SOC_DOBJ_ENUM || + dobj->control.kcontrol->put != + skl_tplg_multi_config_set_dmic) + continue; + sprintf(chan_text, "c%d", mach->mach_params.dmic_num); + + for (i = 0; i < se->items; i++) { + struct snd_ctl_elem_value val; + + if (strstr(texts[i], chan_text)) { + val.value.enumerated.item[0] = i; + kcontrol->put(kcontrol, &val); + } + } + } +} + static struct snd_soc_tplg_ops skl_tplg_ops = { .widget_load = skl_tplg_widget_load, .control_load = skl_tplg_control_load, @@ -3593,6 +3650,7 @@ static struct snd_soc_tplg_ops skl_tplg_ops = { .io_ops_count = ARRAY_SIZE(skl_tplg_kcontrol_ops), .manifest = skl_manifest_load, .dai_load = skl_dai_load, + .complete = skl_tplg_complete, }; /* -- cgit v1.2.3 From 4714d13791f831d253852c8b5d657270becb8b2a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:21:58 +0200 Subject: bridge: uapi: mrp: Add mrp attributes. Add new nested netlink attribute to configure the MRP. These attributes are used by the userspace to add/delete/configure MRP instances and by the kernel to notify the userspace when the MRP ring gets open/closed. MRP nested attribute has the following attributes: IFLA_BRIDGE_MRP_INSTANCE - the parameter type is br_mrp_instance which contains the instance id, and the ifindex of the two ports. The ports can't be part of multiple instances. This is used to create/delete MRP instances. IFLA_BRIDGE_MRP_PORT_STATE - the parameter type is u32. Which can be forwarding, blocking or disabled. IFLA_BRIDGE_MRP_PORT_ROLE - the parameter type is br_mrp_port_role which contains the instance id and the role. The role can be primary or secondary. IFLA_BRIDGE_MRP_RING_STATE - the parameter type is br_mrp_ring_state which contains the instance id and the state. The state can be open or closed. IFLA_BRIDGE_MRP_RING_ROLE - the parameter type is br_mrp_ring_role which contains the instance id and the ring role. The role can be MRM or MRC. IFLA_BRIDGE_MRP_START_TEST - the parameter type is br_mrp_start_test which contains the instance id, the interval at which to send the MRP_Test frames, how many test frames can be missed before declaring the ring open and the period which represent for how long to send the test frames. Also add the file include/uapi/linux/mrp_bridge.h which defines all the types used by MRP that are also needed by the userpace. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 42 +++++++++++++++++++++ include/uapi/linux/if_ether.h | 1 + include/uapi/linux/mrp_bridge.h | 84 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 include/uapi/linux/mrp_bridge.h (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bfe621ea51b3..bd8c95488f16 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -120,6 +120,7 @@ enum { IFLA_BRIDGE_MODE, IFLA_BRIDGE_VLAN_INFO, IFLA_BRIDGE_VLAN_TUNNEL_INFO, + IFLA_BRIDGE_MRP, __IFLA_BRIDGE_MAX, }; #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1) @@ -157,6 +158,47 @@ struct bridge_vlan_xstats { __u32 pad2; }; +enum { + IFLA_BRIDGE_MRP_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE, + IFLA_BRIDGE_MRP_PORT_STATE, + IFLA_BRIDGE_MRP_PORT_ROLE, + IFLA_BRIDGE_MRP_RING_STATE, + IFLA_BRIDGE_MRP_RING_ROLE, + IFLA_BRIDGE_MRP_START_TEST, + __IFLA_BRIDGE_MRP_MAX, +}; + +struct br_mrp_instance { + __u32 ring_id; + __u32 p_ifindex; + __u32 s_ifindex; +}; + +struct br_mrp_port_role { + __u32 ring_id; + __u32 role; +}; + +struct br_mrp_ring_state { + __u32 ring_id; + __u32 ring_state; +}; + +struct br_mrp_ring_role { + __u32 ring_id; + __u32 ring_role; +}; + +struct br_mrp_start_test { + __u32 ring_id; + __u32 interval; + __u32 max_miss; + __u32 period; +}; + +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index f6ceb2e63d1e..d6de2b167448 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -92,6 +92,7 @@ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ #define ETH_P_TIPC 0x88CA /* TIPC */ #define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */ +#define ETH_P_MRP 0x88E3 /* Media Redundancy Protocol */ #define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ #define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ #define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */ diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h new file mode 100644 index 000000000000..2600cdf5a284 --- /dev/null +++ b/include/uapi/linux/mrp_bridge.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_MRP_BRIDGE_H_ +#define _UAPI_LINUX_MRP_BRIDGE_H_ + +#include +#include + +#define MRP_MAX_FRAME_LENGTH 200 +#define MRP_DEFAULT_PRIO 0x8000 +#define MRP_DOMAIN_UUID_LENGTH 16 +#define MRP_VERSION 1 +#define MRP_FRAME_PRIO 7 + +enum br_mrp_ring_role_type { + BR_MRP_RING_ROLE_DISABLED, + BR_MRP_RING_ROLE_MRC, + BR_MRP_RING_ROLE_MRM, +}; + +enum br_mrp_ring_state_type { + BR_MRP_RING_STATE_OPEN, + BR_MRP_RING_STATE_CLOSED, +}; + +enum br_mrp_port_state_type { + BR_MRP_PORT_STATE_DISABLED, + BR_MRP_PORT_STATE_BLOCKED, + BR_MRP_PORT_STATE_FORWARDING, + BR_MRP_PORT_STATE_NOT_CONNECTED, +}; + +enum br_mrp_port_role_type { + BR_MRP_PORT_ROLE_PRIMARY, + BR_MRP_PORT_ROLE_SECONDARY, + BR_MRP_PORT_ROLE_NONE, +}; + +enum br_mrp_tlv_header_type { + BR_MRP_TLV_HEADER_END = 0x0, + BR_MRP_TLV_HEADER_COMMON = 0x1, + BR_MRP_TLV_HEADER_RING_TEST = 0x2, + BR_MRP_TLV_HEADER_RING_TOPO = 0x3, + BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, + BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, +}; + +struct br_mrp_tlv_hdr { + __u8 type; + __u8 length; +}; + +struct br_mrp_end_hdr { + struct br_mrp_tlv_hdr hdr; +}; + +struct br_mrp_common_hdr { + __u16 seq_id; + __u8 domain[MRP_DOMAIN_UUID_LENGTH]; +}; + +struct br_mrp_ring_test_hdr { + __u16 prio; + __u8 sa[ETH_ALEN]; + __u16 port_role; + __u16 state; + __u16 transitions; + __u32 timestamp; +}; + +struct br_mrp_ring_topo_hdr { + __u16 prio; + __u8 sa[ETH_ALEN]; + __u16 interval; +}; + +struct br_mrp_ring_link_hdr { + __u8 sa[ETH_ALEN]; + __u16 port_role; + __u16 interval; + __u16 blocked; +}; + +#endif -- cgit v1.2.3 From 3e54442c93845316762b1b3c75e654463fd1b715 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 26 Apr 2020 15:22:01 +0200 Subject: net: bridge: Add port attribute IFLA_BRPORT_MRP_RING_OPEN This patch adds a new port attribute, IFLA_BRPORT_MRP_RING_OPEN, which allows to notify the userspace when the port lost the continuite of MRP frames. This attribute is set by kernel whenever the SW or HW detects that the ring is being open or closed. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 3 +++ tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 127c704eeba9..a009365ad67b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 43dab4066f91..4084f1ef8641 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -151,6 +151,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + 0; } @@ -213,6 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & + BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index ca6665ea758a..cafedbbfefbe 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 9d2161bed4e39ef7a5e5f18f69c4a57d001051b9 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 Apr 2020 17:37:04 -0400 Subject: audit: log audit netlink multicast bind and unbind Log information about programs connecting to and disconnecting from the audit netlink multicast socket. This is needed so that during investigations a security officer can tell who or what had access to the audit trail. This helps to meet the FAU_SAR.2 requirement for Common Criteria. Here is the systemd startup event: type=PROCTITLE msg=audit(2020-04-22 10:10:21.787:10) : proctitle=/init type=SYSCALL msg=audit(2020-04-22 10:10:21.787:10) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x19 a1=0x555f4aac7e90 a2=0xc a3=0x7ffcb792ff44 items=0 ppid=0 pid=1 auid=unset uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=(none) ses=unset comm=systemd exe=/usr/lib/systemd/systemd subj=kernel key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 10:10:21.787:10) : pid=1 uid=root auid=unset tty=(none) ses=unset subj=kernel comm=systemd exe=/usr/lib/systemd/systemd nl-mcgrp=1 op=connect res=yes And events from the test suite that just uses close(): type=PROCTITLE msg=audit(2020-04-22 11:47:08.501:442) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:47:08.501:442) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x7 a1=0x563004378760 a2=0xc a3=0x0 items=0 ppid=815 pid=818 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:47:08.501:442) : pid=818 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=connect res=yes type=UNKNOWN[1335] msg=audit(2020-04-22 11:47:08.501:443) : pid=818 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=disconnect res=yes And the events from the test suite using setsockopt with NETLINK_DROP_MEMBERSHIP: type=PROCTITLE msg=audit(2020-04-22 11:39:53.291:439) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:39:53.291:439) : arch=x86_64 syscall=bind success=yes exit=0 a0=0x7 a1=0x5560877c2d20 a2=0xc a3=0x0 items=0 ppid=772 pid=775 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:39:53.291:439) : pid=775 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=connect res=yes type=PROCTITLE msg=audit(2020-04-22 11:39:53.292:440) : proctitle=/usr/bin/perl -w amcast_joinpart/test type=SYSCALL msg=audit(2020-04-22 11:39:53.292:440) : arch=x86_64 syscall=setsockopt success=yes exit=0 a0=0x7 a1=SOL_NETLINK a2=0x2 a3=0x7ffc8366f000 items=0 ppid=772 pid=775 auid=root uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=ttyS0 ses=1 comm=perl exe=/usr/bin/perl subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1335] msg=audit(2020-04-22 11:39:53.292:440) : pid=775 uid=root auid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=perl exe=/usr/bin/perl nl-mcgrp=1 op=disconnect res=yes Please see the upstream issue tracker at https://github.com/linux-audit/audit-kernel/issues/28 With the feature description at https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Multicast-Socket-Join-Part The testsuite support is at https://github.com/rgbriggs/audit-testsuite/compare/ghak28-mcast-part-join https://github.com/linux-audit/audit-testsuite/pull/93 And the userspace support patch is at https://github.com/linux-audit/audit-userspace/pull/114 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 1 + kernel/audit.c | 48 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index a534d71e689a..9b6a973f4cc3 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -117,6 +117,7 @@ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ #define AUDIT_BPF 1334 /* BPF subsystem */ +#define AUDIT_EVENT_LISTENER 1335 /* Task joined multicast read socket */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/audit.c b/kernel/audit.c index 622c30246d19..e33460e01b3b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1530,20 +1530,60 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_unlock(); } +/* Log information about who is connecting to the audit multicast socket */ +static void audit_log_multicast(int group, const char *op, int err) +{ + const struct cred *cred; + struct tty_struct *tty; + char comm[sizeof(current->comm)]; + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); + if (!ab) + return; + + cred = current_cred(); + tty = audit_get_tty(); + audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", + task_pid_nr(current), + from_kuid(&init_user_ns, cred->uid), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + tty ? tty_name(tty) : "(none)", + audit_get_sessionid(current)); + audit_put_tty(tty); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_d_path_exe(ab, current->mm); /* exe= */ + audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); + audit_log_end(ab); +} + /* Run custom bind function on netlink socket group connect or bind requests. */ -static int audit_bind(struct net *net, int group) +static int audit_multicast_bind(struct net *net, int group) { + int err = 0; + if (!capable(CAP_AUDIT_READ)) - return -EPERM; + err = -EPERM; + audit_log_multicast(group, "connect", err); + return err; +} - return 0; +static void audit_multicast_unbind(struct net *net, int group) +{ + audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, - .bind = audit_bind, + .bind = audit_multicast_bind, + .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; -- cgit v1.2.3 From 3ff7ddb1353da9b535e65702704cbadea1da9a00 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2020 21:55:37 +0200 Subject: netfilter: nft_nat: add netmap support This patch allows you to NAT the network address prefix onto another network address prefix, a.k.a. netmapping. Userspace must specify the NF_NAT_RANGE_NETMAP flag and the prefix address through the NFTA_NAT_REG_ADDR_MIN and NFTA_NAT_REG_ADDR_MAX netlink attributes. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_nat.h | 4 ++- net/netfilter/nft_nat.c | 46 ++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index 4a95c0db14d4..a64586e77b24 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -11,6 +11,7 @@ #define NF_NAT_RANGE_PERSISTENT (1 << 3) #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) #define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) +#define NF_NAT_RANGE_NETMAP (1 << 6) #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -18,7 +19,8 @@ #define NF_NAT_RANGE_MASK \ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ - NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET) + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \ + NF_NAT_RANGE_NETMAP) struct nf_nat_ipv4_range { unsigned int flags; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 7442aa8b1555..23a7bfd10521 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -60,6 +60,46 @@ static void nft_nat_setup_proto(struct nf_nat_range2 *range, nft_reg_load16(®s->data[priv->sreg_proto_max]); } +static void nft_nat_setup_netmap(struct nf_nat_range2 *range, + const struct nft_pktinfo *pkt, + const struct nft_nat *priv) +{ + struct sk_buff *skb = pkt->skb; + union nf_inet_addr new_addr; + __be32 netmask; + int i, len = 0; + + switch (priv->type) { + case NFT_NAT_SNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->saddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->saddr; + len = sizeof(struct in6_addr); + } + break; + case NFT_NAT_DNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->daddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->daddr; + len = sizeof(struct in6_addr); + } + break; + } + + for (i = 0; i < len / sizeof(__be32); i++) { + netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); + new_addr.ip6[i] &= ~netmask; + new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; + } + + range->min_addr = new_addr; + range->max_addr = new_addr; +} + static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -70,8 +110,12 @@ static void nft_nat_eval(const struct nft_expr *expr, struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); - if (priv->sreg_addr_min) + + if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); + if (priv->flags & NF_NAT_RANGE_NETMAP) + nft_nat_setup_netmap(&range, pkt, priv); + } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); -- cgit v1.2.3 From c57053725d9bd7ab3e14fd0f8001714041df1280 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Fri, 17 Apr 2020 20:50:30 -0400 Subject: drm/amdgpu: add tiling flags from Mesa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DCC_INDEPENDENT_128B is needed for displayble DCC on gfx10. SCANOUT is not needed by the kernel, but Mesa uses it. Signed-off-by: Marek Olšák Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 65f69723cbdc..d28b4ce744d5 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -346,6 +346,10 @@ struct drm_amdgpu_gem_userptr { #define AMDGPU_TILING_DCC_PITCH_MAX_MASK 0x3FFF #define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT 43 #define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK 0x1 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT 44 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK 0x1 +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 0x1 /* Set/Get helpers for tiling flags. */ #define AMDGPU_TILING_SET(field, value) \ -- cgit v1.2.3 From 35ce006004821c5e9ae8fe03d048567cec99c41e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 7 Aug 2019 21:43:24 -0500 Subject: drm/amdgpu: add UAPI for creating encrypted buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a flag to the GEM_CREATE ioctl to create encrypted buffers. Buffers with this flag set will be created with the TMZ bit set in the PTEs or engines accessing them. This is required in order to properly access the data from the engines. Signed-off-by: Alex Deucher Reviewed-by: Huang Rui Reviewed-by: Christian König --- include/uapi/drm/amdgpu_drm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index d28b4ce744d5..4a0829d35e72 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -133,6 +133,11 @@ extern "C" { * releasing the memory */ #define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE (1 << 9) +/* Flag that BO will be encrypted and that the TMZ bit should be + * set in the PTEs when mapping this buffer via GPUVM or + * accessing it with various hw blocks + */ +#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) struct drm_amdgpu_gem_create_in { /** the requested memory size */ -- cgit v1.2.3 From e90c2b210bad457aab73d3357465afe4d4475f7e Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 23 Sep 2019 19:02:41 -0400 Subject: drm/amdgpu: add UAPI to create secure commands (v3) Add a flag to the command submission IOCTL structure which when present indicates that this command submission should be treated as secure. The kernel driver uses this flag to determine whether the engine should be transitioned to secure or unsecure, or the work can be submitted to a secure queue depending on the IP. v3: the flag is now at command submission IOCTL Signed-off-by: Luben Tuikov Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 4a0829d35e72..34cd0bae06bc 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -205,6 +205,9 @@ union drm_amdgpu_bo_list { #define AMDGPU_CTX_OP_QUERY_STATE 3 #define AMDGPU_CTX_OP_QUERY_STATE2 4 +/* Flag the command submission as secure */ +#define AMDGPU_CS_FLAGS_SECURE (1 << 0) + /* GPU reset status */ #define AMDGPU_CTX_NO_RESET 0 /* this the context caused it */ @@ -564,7 +567,7 @@ struct drm_amdgpu_cs_in { /** Handle of resource list associated with CS */ __u32 bo_list_handle; __u32 num_chunks; - __u32 _pad; + __u32 flags; /** this points to __u64 * which point to cs chunks */ __u64 chunks; }; -- cgit v1.2.3 From 4baa8ff0690e464443594353d63c989f0ed9f831 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 27 Nov 2019 15:55:35 -0500 Subject: drm/amdgpu: move CS secure flag next the structs where it's used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So it's not mixed up with the CTX stuff. Reviewed-by: Zhan Liu Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 34cd0bae06bc..bea72eb8c147 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -205,9 +205,6 @@ union drm_amdgpu_bo_list { #define AMDGPU_CTX_OP_QUERY_STATE 3 #define AMDGPU_CTX_OP_QUERY_STATE2 4 -/* Flag the command submission as secure */ -#define AMDGPU_CS_FLAGS_SECURE (1 << 0) - /* GPU reset status */ #define AMDGPU_CTX_NO_RESET 0 /* this the context caused it */ @@ -561,6 +558,9 @@ struct drm_amdgpu_cs_chunk { __u64 chunk_data; }; +/* Flag the command submission as secure */ +#define AMDGPU_CS_FLAGS_SECURE (1 << 0) + struct drm_amdgpu_cs_in { /** Rendering context id */ __u32 ctx_id; -- cgit v1.2.3 From 0bb5d5b03f78aeb5f87d47877eb15532875c64da Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Wed, 22 Apr 2020 17:56:56 -0400 Subject: drm/amdgpu: Move to a per-IB secure flag (TMZ) Move from a per-CS secure flag (TMZ) to a per-IB secure flag. Signed-off-by: Luben Tuikov Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 23 ++++++++++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 3 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 9 ++++----- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 23 +++++++---------------- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 20 ++++++-------------- include/uapi/drm/amdgpu_drm.h | 7 ++++--- 10 files changed, 44 insertions(+), 52 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 99de770a8e9f..3eee5c7d83e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -232,8 +232,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs if (ret) goto free_all_kdata; - p->job->secure = cs->in.flags & AMDGPU_CS_FLAGS_SECURE; - if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) { ret = -ECANCELED; goto free_all_kdata; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index 045951d2b46c..cba22039df6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -133,6 +133,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, uint64_t fence_ctx; uint32_t status = 0, alloc_size; unsigned fence_flags = 0; + bool secure; unsigned i; int r = 0; @@ -214,9 +215,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, if (job && ring->funcs->emit_cntxcntl) { status |= job->preamble_status; status |= job->preemption_status; - amdgpu_ring_emit_cntxcntl(ring, status, job->secure); + amdgpu_ring_emit_cntxcntl(ring, status); } + secure = false; for (i = 0; i < num_ibs; ++i) { ib = &ibs[i]; @@ -228,12 +230,27 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, !amdgpu_sriov_vf(adev)) /* for SRIOV preemption, Preamble CE ib must be inserted anyway */ continue; + /* If this IB is TMZ, add frame TMZ start packet, + * else, turn off TMZ. + */ + if (ib->flags & AMDGPU_IB_FLAGS_SECURE && ring->funcs->emit_tmz) { + if (!secure) { + secure = true; + amdgpu_ring_emit_tmz(ring, true); + } + } else if (secure) { + secure = false; + amdgpu_ring_emit_tmz(ring, false); + } + amdgpu_ring_emit_ib(ring, job, ib, status); status &= ~AMDGPU_HAVE_CTX_SWITCH; } - if (ring->funcs->emit_tmz) - amdgpu_ring_emit_tmz(ring, false, job ? job->secure : false); + if (secure) { + secure = false; + amdgpu_ring_emit_tmz(ring, false); + } #ifdef CONFIG_X86_64 if (!(adev->flags & AMD_IS_APU)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 7f5ccee476a4..81caac9b958a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -62,9 +62,6 @@ struct amdgpu_job { /* user fence handling */ uint64_t uf_addr; uint64_t uf_sequence; - - /* the job is due to a secure command submission */ - bool secure; }; int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 5956eff2d784..7d39064f9361 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -168,8 +168,7 @@ struct amdgpu_ring_funcs { void (*begin_use)(struct amdgpu_ring *ring); void (*end_use)(struct amdgpu_ring *ring); void (*emit_switch_buffer) (struct amdgpu_ring *ring); - void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags, - bool trusted); + void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags); void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t reg_val_offs); void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val); @@ -178,7 +177,7 @@ struct amdgpu_ring_funcs { void (*emit_reg_write_reg_wait)(struct amdgpu_ring *ring, uint32_t reg0, uint32_t reg1, uint32_t ref, uint32_t mask); - void (*emit_tmz)(struct amdgpu_ring *ring, bool start, bool trusted); + void (*emit_tmz)(struct amdgpu_ring *ring, bool start); /* Try to soft recover the ring to make the fence signal */ void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid); int (*preempt_ib)(struct amdgpu_ring *ring); @@ -252,12 +251,12 @@ struct amdgpu_ring { #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as)) #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r)) #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r)) -#define amdgpu_ring_emit_cntxcntl(r, d, s) (r)->funcs->emit_cntxcntl((r), (d), (s)) +#define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d)) #define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o)) #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v)) #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m)) #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) -#define amdgpu_ring_emit_tmz(r, b, s) (r)->funcs->emit_tmz((r), (b), (s)) +#define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b)) #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib))) #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r)) #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o)) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 473c1c145332..404c6d470515 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3037,8 +3037,7 @@ static int gfx_v10_0_rlc_backdoor_autoload_enable(struct amdgpu_device *adev); static int gfx_v10_0_wait_for_rlc_autoload_complete(struct amdgpu_device *adev); static void gfx_v10_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume); static void gfx_v10_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume); -static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start, - bool trusted); +static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start); static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t queue_mask) { @@ -7436,8 +7435,7 @@ static void gfx_v10_0_ring_emit_sb(struct amdgpu_ring *ring) } static void gfx_v10_0_ring_emit_cntxcntl(struct amdgpu_ring *ring, - uint32_t flags, - bool trusted) + uint32_t flags) { uint32_t dw2 = 0; @@ -7445,8 +7443,6 @@ static void gfx_v10_0_ring_emit_cntxcntl(struct amdgpu_ring *ring, gfx_v10_0_ring_emit_ce_meta(ring, (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? true : false); - gfx_v10_0_ring_emit_tmz(ring, true, trusted); - dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */ if (flags & AMDGPU_HAVE_CTX_SWITCH) { /* set load_global_config & load_global_uconfig */ @@ -7603,17 +7599,12 @@ static void gfx_v10_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume) sizeof(de_payload) >> 2); } -static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start, - bool trusted) +static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start) { - amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0)); - /* - * cmd = 0: frame begin - * cmd = 1: frame end - */ - amdgpu_ring_write(ring, - ((amdgpu_is_tmz(ring->adev) && trusted) ? FRAME_TMZ : 0) - | FRAME_CMD(start ? 0 : 1)); + if (amdgpu_is_tmz(ring->adev)) { + amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0)); + amdgpu_ring_write(ring, FRAME_TMZ | FRAME_CMD(start ? 0 : 1)); + } } static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index 283b7fc10f98..aa1e1be852dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -2969,8 +2969,7 @@ static uint64_t gfx_v6_0_get_gpu_clock_counter(struct amdgpu_device *adev) return clock; } -static void gfx_v6_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags, - bool trusted) +static void gfx_v6_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) { if (flags & AMDGPU_HAVE_CTX_SWITCH) gfx_v6_0_ring_emit_vgt_flush(ring); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index f26e91354ba8..e5a88cad44cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -2320,8 +2320,7 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring, amdgpu_ring_write(ring, control); } -static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags, - bool trusted) +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) { uint32_t dw2 = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index d1312d829252..2fcf6865abba 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -6329,8 +6329,7 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring) amdgpu_ring_write(ring, 0); } -static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags, - bool trusted) +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) { uint32_t dw2 = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index bae5dd6ea348..4e042e974983 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5442,29 +5442,21 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring) amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2); } -static void gfx_v9_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start, - bool trusted) +static void gfx_v9_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start) { - amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0)); - /* - * cmd = 0: frame begin - * cmd = 1: frame end - */ - amdgpu_ring_write(ring, - ((amdgpu_is_tmz(ring->adev) && trusted) ? FRAME_TMZ : 0) - | FRAME_CMD(start ? 0 : 1)); + if (amdgpu_is_tmz(ring->adev)) { + amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0)); + amdgpu_ring_write(ring, FRAME_TMZ | FRAME_CMD(start ? 0 : 1)); + } } -static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags, - bool trusted) +static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) { uint32_t dw2 = 0; if (amdgpu_sriov_vf(ring->adev)) gfx_v9_0_ring_emit_ce_meta(ring); - gfx_v9_0_ring_emit_tmz(ring, true, trusted); - dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */ if (flags & AMDGPU_HAVE_CTX_SWITCH) { /* set load_global_config & load_global_uconfig */ diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index bea72eb8c147..e01b673f0449 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -558,9 +558,6 @@ struct drm_amdgpu_cs_chunk { __u64 chunk_data; }; -/* Flag the command submission as secure */ -#define AMDGPU_CS_FLAGS_SECURE (1 << 0) - struct drm_amdgpu_cs_in { /** Rendering context id */ __u32 ctx_id; @@ -601,6 +598,10 @@ union drm_amdgpu_cs { */ #define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) +/* Flag the IB as secure (TMZ) + */ +#define AMDGPU_IB_FLAGS_SECURE (1 << 5) + struct drm_amdgpu_cs_chunk_ib { __u32 _pad; /** AMDGPU_IB_FLAG_* */ -- cgit v1.2.3 From 5bb4b78be9c67b02a7f138850e9e89825181f555 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Mon, 6 May 2019 22:11:14 -0500 Subject: drm/amdkfd: New IOCTL to allocate queue GWS (v2) Add a new kfd ioctl to allocate queue GWS. Queue GWS is released on queue destroy. v2: re-introduce this API with the following fixes squashed in: - drm/amdkfd: fix null pointer dereference on dev - drm/amdkfd: Return proper error code for gws alloc API - drm/amdkfd: Remove GPU ID in GWS queue creation Signed-off-by: Oak Zeng Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 39 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 9 +++++ include/uapi/linux/kfd_ioctl.h | 19 ++++++++++- 4 files changed, 68 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 0ec5f25adf56..5eb1314f500b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1584,6 +1584,43 @@ copy_from_user_failed: return err; } +static int kfd_ioctl_alloc_queue_gws(struct file *filep, + struct kfd_process *p, void *data) +{ + int retval; + struct kfd_ioctl_alloc_queue_gws_args *args = data; + struct queue *q; + struct kfd_dev *dev; + + if (!hws_gws_support) + return -ENODEV; + + mutex_lock(&p->mutex); + q = pqm_get_user_queue(&p->pqm, args->queue_id); + + if (q) { + dev = q->device; + } else { + retval = -EINVAL; + goto out_unlock; + } + + if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { + retval = -ENODEV; + goto out_unlock; + } + + retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL); + mutex_unlock(&p->mutex); + + args->first_gws = 0; + return retval; + +out_unlock: + mutex_unlock(&p->mutex); + return retval; +} + static int kfd_ioctl_get_dmabuf_info(struct file *filep, struct kfd_process *p, void *data) { @@ -1786,6 +1823,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, kfd_ioctl_import_dmabuf, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, + kfd_ioctl_alloc_queue_gws, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 4a3049841086..5e7f1fb6761b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -923,6 +923,8 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, void *gws); struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, unsigned int qid); +struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, + unsigned int qid); int pqm_get_wave_state(struct process_queue_manager *pqm, unsigned int qid, void __user *ctl_stack, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 084c35f55d59..eb1635ac8988 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -476,6 +476,15 @@ struct kernel_queue *pqm_get_kernel_queue( return NULL; } +struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, + unsigned int qid) +{ + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + return pqn ? pqn->q : NULL; +} + int pqm_get_wave_state(struct process_queue_manager *pqm, unsigned int qid, void __user *ctl_stack, diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 20917c59f39c..4f6676428c5c 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -410,6 +410,20 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { __u32 n_success; /* to/from KFD */ }; +/* Allocate GWS for specific queue + * + * @queue_id: queue's id that GWS is allocated for + * @num_gws: how many GWS to allocate + * @first_gws: index of the first GWS allocated. + * only support contiguous GWS allocation + */ +struct kfd_ioctl_alloc_queue_gws_args { + __u32 queue_id; /* to KFD */ + __u32 num_gws; /* to KFD */ + __u32 first_gws; /* from KFD */ + __u32 pad; +}; + struct kfd_ioctl_get_dmabuf_info_args { __u64 size; /* from KFD */ __u64 metadata_ptr; /* to KFD */ @@ -529,7 +543,10 @@ enum kfd_mmio_remap { #define AMDKFD_IOC_IMPORT_DMABUF \ AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args) +#define AMDKFD_IOC_ALLOC_QUEUE_GWS \ + AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1E +#define AMDKFD_COMMAND_END 0x1F #endif -- cgit v1.2.3 From a3b80e1078943dc12553166fb08e258463dec013 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:06 -0700 Subject: bpf: Allocate ID for bpf_link Generate ID for each bpf_link using IDR, similarly to bpf_map and bpf_prog. bpf_link creation, initialization, attachment, and exposing to user-space through FD and ID is a complicated multi-step process, abstract it away through bpf_link_primer and bpf_link_prime(), bpf_link_settle(), and bpf_link_cleanup() internal API. They guarantee that until bpf_link is properly attached, user-space won't be able to access partially-initialized bpf_link either from FD or ID. All this allows to simplify bpf_link attachment and error handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-3-andriin@fb.com --- include/linux/bpf.h | 17 ++++-- include/uapi/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 14 ++--- kernel/bpf/syscall.c | 143 ++++++++++++++++++++++++++++++++--------------- 4 files changed, 118 insertions(+), 57 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 81c8620cb4c4..875d1f0af803 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1085,11 +1085,19 @@ int bpf_prog_new_fd(struct bpf_prog *prog); struct bpf_link { atomic64_t refcnt; + u32 id; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; }; +struct bpf_link_primer { + struct bpf_link *link; + struct file *file; + int fd; + u32 id; +}; + struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); @@ -1097,10 +1105,11 @@ struct bpf_link_ops { struct bpf_prog *old_prog); }; -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog); -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd); +void bpf_link_init(struct bpf_link *link, + const struct bpf_link_ops *ops, struct bpf_prog *prog); +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); +int bpf_link_settle(struct bpf_link_primer *primer); +void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a6c47f3febe..6121aa487465 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -523,6 +523,7 @@ union bpf_attr { __u32 prog_id; __u32 map_id; __u32 btf_id; + __u32 link_id; }; __u32 next_id; __u32 open_flags; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da6e48e802b2..1bdf37fca879 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -841,10 +841,10 @@ const struct bpf_link_ops bpf_cgroup_link_lops = { int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_cgroup_link *link; - struct file *link_file; struct cgroup *cgrp; - int err, link_fd; + int err; if (attr->link_create.flags) return -EINVAL; @@ -862,22 +862,20 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_cgroup; } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, BPF_F_ALLOW_MULTI); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_cgroup; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_cgroup: cgroup_put(cgrp); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f5358e1462eb..5439e05e3d25 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -42,6 +42,8 @@ static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); static DEFINE_SPINLOCK(map_idr_lock); +static DEFINE_IDR(link_idr); +static DEFINE_SPINLOCK(link_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -2181,25 +2183,38 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, - struct bpf_prog *prog) +void bpf_link_init(struct bpf_link *link, + const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->id = 0; link->ops = ops; link->prog = prog; } +static void bpf_link_free_id(int id) +{ + if (!id) + return; + + spin_lock_bh(&link_idr_lock); + idr_remove(&link_idr, id); + spin_unlock_bh(&link_idr_lock); +} + /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper manages marking bpf_link as - * defunct, releases anon_inode file and puts reserved FD. + * anon_inode's release() call. This helper marksbpf_link as + * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt + * is not decremented, it's the responsibility of a calling code that failed + * to complete bpf_link initialization. */ -void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link_primer *primer) { - link->prog = NULL; - fput(link_file); - put_unused_fd(link_fd); + primer->link->prog = NULL; + bpf_link_free_id(primer->id); + fput(primer->file); + put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) @@ -2210,6 +2225,7 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bpf_link_free_id(link->id); if (link->prog) { /* detach BPF program, clean up used resources */ link->ops->release(link); @@ -2275,9 +2291,11 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" + "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", link_type, + link->id, prog_tag, prog->aux->id); } @@ -2292,36 +2310,76 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; -int bpf_link_new_fd(struct bpf_link *link) +static int bpf_link_alloc_id(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -} + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&link_idr_lock); + id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); + spin_unlock_bh(&link_idr_lock); + idr_preload_end(); -/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but - * instead of immediately installing fd in fdtable, just reserve it and - * return. Caller then need to either install it with fd_install(fd, file) or - * release with put_unused_fd(fd). - * This is useful for cases when bpf_link attachment/detachment are - * complicated and expensive operations and should be delayed until all the fd - * reservation and anon_inode creation succeeds. + return id; +} + +/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, + * reserving unused FD and allocating ID from link_idr. This is to be paired + * with bpf_link_settle() to install FD and ID and expose bpf_link to + * user-space, if bpf_link is successfully attached. If not, bpf_link and + * pre-allocated resources are to be freed with bpf_cleanup() call. All the + * transient state is passed around in struct bpf_link_primer. + * This is preferred way to create and initialize bpf_link, especially when + * there are complicated and expensive operations inbetween creating bpf_link + * itself and attaching it to BPF hook. By using bpf_link_prime() and + * bpf_link_settle() kernel code using bpf_link doesn't have to perform + * expensive (and potentially failing) roll back operations in a rare case + * that file, FD, or ID can't be allocated. */ -struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; - int fd; + int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - return ERR_PTR(fd); + return fd; file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { put_unused_fd(fd); - return file; + return PTR_ERR(file); } - *reserved_fd = fd; - return file; + id = bpf_link_alloc_id(link); + if (id < 0) { + put_unused_fd(fd); + fput(file); + return id; + } + + primer->link = link; + primer->file = file; + primer->fd = fd; + primer->id = id; + return 0; +} + +int bpf_link_settle(struct bpf_link_primer *primer) +{ + /* make bpf_link fetchable by ID */ + spin_lock_bh(&link_idr_lock); + primer->link->id = primer->id; + spin_unlock_bh(&link_idr_lock); + /* make bpf_link fetchable by FD */ + fd_install(primer->fd, primer->file); + /* pass through installed FD */ + return primer->fd; +} + +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) @@ -2367,9 +2425,9 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog) { + struct bpf_link_primer link_primer; struct bpf_tracing_link *link; - struct file *link_file; - int link_fd, err; + int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -2404,22 +2462,19 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) } bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_prog; } err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_prog; } - fd_install(link_fd, link_file); - return link_fd; - + return bpf_link_settle(&link_primer); out_put_prog: bpf_prog_put(prog); return err; @@ -2447,7 +2502,7 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } -static const struct bpf_link_ops bpf_raw_tp_lops = { +static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, }; @@ -2456,13 +2511,13 @@ static const struct bpf_link_ops bpf_raw_tp_lops = { static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { + struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int link_fd, err; + int err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2515,24 +2570,22 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); link->btp = btp; - link_file = bpf_link_new_file(&link->link, &link_fd); - if (IS_ERR(link_file)) { + err = bpf_link_prime(&link->link, &link_primer); + if (err) { kfree(link); - err = PTR_ERR(link_file); goto out_put_btp; } err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_cleanup(&link->link, link_file, link_fd); + bpf_link_cleanup(&link_primer); goto out_put_btp; } - fd_install(link_fd, link_file); - return link_fd; + return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); @@ -3464,7 +3517,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; - if (link->ops == &bpf_raw_tp_lops) { + if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; -- cgit v1.2.3 From 2d602c8cf40d65d4a7ac34fe18648d8778e6e594 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:07 -0700 Subject: bpf: Support GET_FD_BY_ID and GET_NEXT_ID for bpf_link Add support to look up bpf_link by ID and iterate over all existing bpf_links in the system. GET_FD_BY_ID code handles not-yet-ready bpf_link by checking that its ID hasn't been set to non-zero value yet. Setting bpf_link's ID is done as the very last step in finalizing bpf_link, together with installing FD. This approach allows users of bpf_link in kernel code to not worry about races between user-space and kernel code that hasn't finished attaching and initializing bpf_link. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-4-andriin@fb.com --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6121aa487465..7e6541fceade 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,8 @@ enum bpf_cmd { BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5439e05e3d25..1c213a730502 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3713,6 +3713,48 @@ out_put_link: return ret; } +static int bpf_link_inc_not_zero(struct bpf_link *link) +{ + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; +} + +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd, err; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&link_idr_lock); + link = idr_find(&link_idr, id); + /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + if (link) { + if (link->id) + err = bpf_link_inc_not_zero(link); + else + err = -EAGAIN; + } else { + err = -ENOENT; + } + spin_unlock_bh(&link_idr_lock); + + if (err) + return err; + + fd = bpf_link_new_fd(link); + if (fd < 0) + bpf_link_put(link); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3830,6 +3872,13 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_UPDATE: err = link_update(&attr); break; + case BPF_LINK_GET_FD_BY_ID: + err = bpf_link_get_fd_by_id(&attr); + break; + case BPF_LINK_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &link_idr, &link_idr_lock); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From f2e10bff16a0fdd41ba278c84da9813700e356af Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Apr 2020 17:16:08 -0700 Subject: bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link Add ability to fetch bpf_link details through BPF_OBJ_GET_INFO_BY_FD command. Also enhance show_fdinfo to potentially include bpf_link type-specific information (similarly to obj_info). Also introduce enum bpf_link_type stored in bpf_link itself and expose it in UAPI. bpf_link_tracing also now will store and return bpf_attach_type. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200429001614.1544-5-andriin@fb.com --- include/linux/bpf-cgroup.h | 2 - include/linux/bpf.h | 8 ++- include/linux/bpf_types.h | 6 ++ include/uapi/linux/bpf.h | 28 ++++++++ kernel/bpf/btf.c | 2 + kernel/bpf/cgroup.c | 43 +++++++++++- kernel/bpf/syscall.c | 155 ++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 2 + tools/include/uapi/linux/bpf.h | 31 +++++++++ 9 files changed, 253 insertions(+), 24 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a9cb9a5bf8e9..272626cc3fc9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -57,8 +57,6 @@ struct bpf_cgroup_link { enum bpf_attach_type type; }; -extern const struct bpf_link_ops bpf_cgroup_link_lops; - struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 875d1f0af803..c07b1d2f3824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1026,9 +1026,11 @@ extern const struct file_operations bpf_prog_fops; extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; @@ -1086,6 +1088,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog); struct bpf_link { atomic64_t refcnt; u32 id; + enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; struct work_struct work; @@ -1103,9 +1106,12 @@ struct bpf_link_ops { void (*dealloc)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); + void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); + int (*fill_link_info)(const struct bpf_link *link, + struct bpf_link_info *info); }; -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ba0c2d56f8a3..8345cdf553b8 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,3 +118,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif + +BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) +#ifdef CONFIG_CGROUP_BPF +BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) +#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7e6541fceade..0eccafae55bb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -222,6 +222,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -3612,6 +3621,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d65c6912bdaf..a2cfba89a8e1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) static union { struct bpf_ctx_convert { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ @@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = { 0, /* avoid empty array */ }; #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1bdf37fca879..5c0e964105ac 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -833,10 +833,48 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } -const struct bpf_link_ops bpf_cgroup_link_lops = { +static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + seq_printf(seq, + "cgroup_id:\t%llu\n" + "attach_type:\t%d\n", + cg_id, + cg_link->type); +} + +static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + u64 cg_id = 0; + + mutex_lock(&cgroup_mutex); + if (cg_link->cgroup) + cg_id = cgroup_id(cg_link->cgroup); + mutex_unlock(&cgroup_mutex); + + info->cgroup.cgroup_id = cg_id; + info->cgroup.attach_type = cg_link->type; + return 0; +} + +static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, .update_prog = cgroup_bpf_replace, + .show_fdinfo = bpf_cgroup_link_show_fdinfo, + .fill_link_info = bpf_cgroup_link_fill_link_info, }; int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -858,7 +896,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = -ENOMEM; goto out_put_cgroup; } - bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, + prog); link->cgroup = cgrp; link->type = attr->link_create.attach_type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1c213a730502..d23c04cbe14f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -51,9 +51,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* @@ -1548,9 +1550,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) @@ -2183,10 +2187,11 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { atomic64_set(&link->refcnt, 1); + link->type = type; link->id = 0; link->ops = ops; link->prog = prog; @@ -2266,27 +2271,23 @@ static int bpf_link_release(struct inode *inode, struct file *filp) return 0; } -#ifdef CONFIG_PROC_FS -static const struct bpf_link_ops bpf_raw_tp_lops; -static const struct bpf_link_ops bpf_tracing_link_lops; +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) [_id] = #_name, +static const char *bpf_link_type_strs[] = { + [BPF_LINK_TYPE_UNSPEC] = "", +#include +}; +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE +#ifdef CONFIG_PROC_FS static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - const char *link_type; - - if (link->ops == &bpf_raw_tp_lops) - link_type = "raw_tracepoint"; - else if (link->ops == &bpf_tracing_link_lops) - link_type = "tracing"; -#ifdef CONFIG_CGROUP_BPF - else if (link->ops == &bpf_cgroup_link_lops) - link_type = "cgroup"; -#endif - else - link_type = "unknown"; bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -2294,10 +2295,12 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) "link_id:\t%u\n" "prog_tag:\t%s\n" "prog_id:\t%u\n", - link_type, + bpf_link_type_strs[link->type], link->id, prog_tag, prog->aux->id); + if (link->ops->show_fdinfo) + link->ops->show_fdinfo(link, m); } #endif @@ -2403,6 +2406,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; + enum bpf_attach_type attach_type; }; static void bpf_tracing_link_release(struct bpf_link *link) @@ -2418,9 +2422,33 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) kfree(tr_link); } +static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + seq_printf(seq, + "attach_type:\t%d\n", + tr_link->attach_type); +} + +static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + info->tracing.attach_type = tr_link->attach_type; + + return 0; +} + static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, + .show_fdinfo = bpf_tracing_link_show_fdinfo, + .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) @@ -2460,7 +2488,9 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog); + link->attach_type = prog->expected_attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -2502,9 +2532,56 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link) kfree(raw_tp); } +static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + + seq_printf(seq, + "tp_name:\t%s\n", + raw_tp_link->btp->tp->name); +} + +static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_raw_tp_link *raw_tp_link = + container_of(link, struct bpf_raw_tp_link, link); + char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); + const char *tp_name = raw_tp_link->btp->tp->name; + u32 ulen = info->raw_tracepoint.tp_name_len; + size_t tp_len = strlen(tp_name); + + if (ulen && !ubuf) + return -EINVAL; + + info->raw_tracepoint.tp_name_len = tp_len + 1; + + if (!ubuf) + return 0; + + if (ulen >= tp_len + 1) { + if (copy_to_user(ubuf, tp_name, tp_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, tp_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc = bpf_raw_tp_link_dealloc, + .show_fdinfo = bpf_raw_tp_link_show_fdinfo, + .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -2570,7 +2647,8 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, &bpf_raw_tp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog); link->btp = btp; err = bpf_link_prime(&link->link, &link_primer); @@ -3366,6 +3444,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } +static int bpf_link_get_info_by_fd(struct bpf_link *link, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_link_info info; + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + + info.type = link->type; + info.id = link->id; + info.prog_id = link->prog->aux->id; + + if (link->ops->fill_link_info) { + err = link->ops->fill_link_info(link, &info); + if (err) + return err; + } + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -3390,6 +3504,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, uattr); else if (f.file->f_op == &btf_fops) err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &bpf_link_fops) + err = bpf_link_get_info_by_fd(f.file->private_data, + attr, uattr); else err = -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 91728e0f27eb..2b337e32aa94 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) +#define BPF_LINK_TYPE(_id, _name) #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +#undef BPF_LINK_TYPE }; /* bpf_check() is a static code analyzer that walks eBPF program diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4a6c47f3febe..0eccafae55bb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -113,6 +113,8 @@ enum bpf_cmd { BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, }; enum bpf_map_type { @@ -220,6 +222,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -523,6 +534,7 @@ union bpf_attr { __u32 prog_id; __u32 map_id; __u32 btf_id; + __u32 link_id; }; __u32 next_id; __u32 open_flags; @@ -3609,6 +3621,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). -- cgit v1.2.3 From 97f9ac3db6612f14ac0c509e1a63ce14fd4cc0eb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 21 Apr 2020 12:44:49 -0500 Subject: crypto: ccp - Add support for SEV-ES to the PSP driver To provide support for SEV-ES, the hypervisor must provide an area of memory to the PSP. Once this Trusted Memory Region (TMR) is provided to the PSP, the contents of this area of memory are no longer available to the x86. Update the PSP driver to allocate a 1MB region for the TMR that is 1MB aligned and then provide it to the PSP through the SEV INIT command. Signed-off-by: Tom Lendacky Reviewed-by: Brijesh Singh Reviewed-by: Joerg Roedel Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/psp-sev.h | 2 ++ include/uapi/linux/psp-sev.h | 2 ++ 3 files changed, 47 insertions(+) (limited to 'include/uapi') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 896f190b9a50..439cd737076e 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -44,6 +45,14 @@ MODULE_PARM_DESC(psp_probe_timeout, " default timeout value, in seconds, during static bool psp_dead; static int psp_timeout; +/* Trusted Memory Region (TMR): + * The TMR is a 1MB area that must be 1MB aligned. Use the page allocator + * to allocate the memory, which will return aligned memory for the specified + * allocation order. + */ +#define SEV_ES_TMR_SIZE (1024 * 1024) +static void *sev_es_tmr; + static inline bool sev_version_greater_or_equal(u8 maj, u8 min) { struct sev_device *sev = psp_master->sev_data; @@ -214,6 +223,20 @@ static int __sev_platform_init_locked(int *error) if (sev->state == SEV_STATE_INIT) return 0; + if (sev_es_tmr) { + u64 tmr_pa; + + /* + * Do not include the encryption mask on the physical + * address of the TMR (firmware should clear it anyway). + */ + tmr_pa = __pa(sev_es_tmr); + + sev->init_cmd_buf.flags |= SEV_INIT_FLAGS_SEV_ES; + sev->init_cmd_buf.tmr_address = tmr_pa; + sev->init_cmd_buf.tmr_len = SEV_ES_TMR_SIZE; + } + rc = __sev_do_cmd_locked(SEV_CMD_INIT, &sev->init_cmd_buf, error); if (rc) return rc; @@ -1012,6 +1035,7 @@ EXPORT_SYMBOL_GPL(sev_issue_cmd_external_user); void sev_pci_init(void) { struct sev_device *sev = psp_master->sev_data; + struct page *tmr_page; int error, rc; if (!sev) @@ -1041,6 +1065,16 @@ void sev_pci_init(void) sev_update_firmware(sev->dev) == 0) sev_get_api_version(); + /* Obtain the TMR memory area for SEV-ES use */ + tmr_page = alloc_pages(GFP_KERNEL, get_order(SEV_ES_TMR_SIZE)); + if (tmr_page) { + sev_es_tmr = page_address(tmr_page); + } else { + sev_es_tmr = NULL; + dev_warn(sev->dev, + "SEV: TMR allocation failed, SEV-ES support unavailable\n"); + } + /* Initialize the platform */ rc = sev_platform_init(&error); if (rc && (error == SEV_RET_SECURE_DATA_INVALID)) { @@ -1075,4 +1109,13 @@ void sev_pci_exit(void) return; sev_platform_shutdown(NULL); + + if (sev_es_tmr) { + /* The TMR area was encrypted, flush it from the cache */ + wbinvd_on_all_cpus(); + + free_pages((unsigned long)sev_es_tmr, + get_order(SEV_ES_TMR_SIZE)); + sev_es_tmr = NULL; + } } diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 5167bf2bfc75..7fbc8679145c 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -100,6 +100,8 @@ struct sev_data_init { u32 tmr_len; /* In */ } __packed; +#define SEV_INIT_FLAGS_SEV_ES 0x01 + /** * struct sev_data_pek_csr - PEK_CSR command parameters * diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 0549a5c622bf..91b4c63d5cbf 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -83,6 +83,8 @@ struct sev_user_data_status { __u32 guest_count; /* Out */ } __packed; +#define SEV_STATUS_FLAGS_CONFIG_ES 0x0100 + /** * struct sev_user_data_pek_csr - PEK_CSR command parameters * -- cgit v1.2.3 From 6e3a401fc8af01828bcdc92d713195d318b36e7e Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:14 +0300 Subject: inet_diag: add cgroup id attribute This patch adds cgroup v2 ID to common inet diag message attributes. Cgroup v2 ID is kernfs ID (ino or ino+gen). This attribute allows filter inet diag output by cgroup ID obtained by name_to_handle_at() syscall. When net_cls or net_prio cgroup is activated this ID is equal to 1 (root cgroup ID) for newly created sockets. Some notes about this ID: 1) gets initialized in socket() syscall 2) incoming socket gets ID from listening socket (not during accept() syscall) 3) not changed when process get moved to another cgroup 4) can point to deleted cgroup (refcounting) v2: - use CONFIG_SOCK_CGROUP_DATA instead if CONFIG_CGROUPS v3: - fix attr size by using nla_total_size_64bit() (Eric Dumazet) - more detailed commit message (Konstantin Khlebnikov) Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Acked-By: Tejun Heo Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 6 +++++- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index ce9ed1c0602f..0ef2d800fda7 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -71,7 +71,11 @@ static inline size_t inet_diag_msg_attrs_size(void) + nla_total_size(1) /* INET_DIAG_SKV6ONLY */ #endif + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(4); /* INET_DIAG_CLASS_ID */ + + nla_total_size(4) /* INET_DIAG_CLASS_ID */ +#ifdef CONFIG_SOCK_CGROUP_DATA + + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ +#endif + ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 57cc429a9177..c9b1e551792c 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -157,6 +157,7 @@ enum { INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, + INET_DIAG_CGROUP_ID, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5d50aad3cdbf..9c4c315cbc10 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -162,6 +162,13 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, goto errout; } +#ifdef CONFIG_SOCK_CGROUP_DATA + if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), + INET_DIAG_PAD)) + goto errout; +#endif + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); -- cgit v1.2.3 From b1f3e43dbfacfcd95296b0f80f84b186add9ef54 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Thu, 30 Apr 2020 18:51:15 +0300 Subject: inet_diag: add support for cgroup filter This patch adds ability to filter sockets based on cgroup v2 ID. Such filter is helpful in ss utility for filtering sockets by cgroup pathname. Signed-off-by: Dmitry Yakunin Reviewed-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index c9b1e551792c..e6f183ee8417 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -96,6 +96,7 @@ enum { INET_DIAG_BC_MARK_COND, INET_DIAG_BC_S_EQ, INET_DIAG_BC_D_EQ, + INET_DIAG_BC_CGROUP_COND, /* u64 cgroup v2 ID */ }; struct inet_diag_hostcond { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 9c4c315cbc10..0034092358c3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -43,6 +43,9 @@ struct inet_diag_entry { u16 userlocks; u32 ifindex; u32 mark; +#ifdef CONFIG_SOCK_CGROUP_DATA + u64 cgroup_id; +#endif }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -682,6 +685,16 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: { + u64 cgroup_id; + + cgroup_id = get_unaligned((const u64 *)(op + 1)); + if (cgroup_id != entry->cgroup_id) + yes = 0; + break; + } +#endif } if (yes) { @@ -732,6 +745,9 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else entry.mark = 0; +#ifdef CONFIG_SOCK_CGROUP_DATA + entry.cgroup_id = cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)); +#endif return inet_diag_bc_run(bc, &entry); } @@ -821,6 +837,15 @@ static bool valid_markcond(const struct inet_diag_bc_op *op, int len, return len >= *min_len; } +#ifdef CONFIG_SOCK_CGROUP_DATA +static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + *min_len += sizeof(u64); + return len >= *min_len; +} +#endif + static int inet_diag_bc_audit(const struct nlattr *attr, const struct sk_buff *skb) { @@ -863,6 +888,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_markcond(bc, len, &min_len)) return -EINVAL; break; +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: + if (!valid_cgroupcond(bc, len, &min_len)) + return -EINVAL; + break; +#endif case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: -- cgit v1.2.3 From 06bfa47e72c83550fefc93c62a1ace5fff72e212 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:31 +0200 Subject: docs: networking: convert timestamping.txt to ReST - add SPDX header; - add a document title; - adjust titles and chapters, adding proper markups; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/packet_mmap.rst | 4 +- Documentation/networking/timestamping.rst | 591 ++++++++++++++++++++++++++++++ Documentation/networking/timestamping.txt | 571 ----------------------------- include/uapi/linux/errqueue.h | 2 +- 5 files changed, 595 insertions(+), 574 deletions(-) create mode 100644 Documentation/networking/timestamping.rst delete mode 100644 Documentation/networking/timestamping.txt (limited to 'include/uapi') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index be65ee509669..8f9a84b8e3f2 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -109,6 +109,7 @@ Contents: tc-actions-env-rules tcp-thin team + timestamping .. only:: subproject and html diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst index 884c7222b9e9..6c009ceb1183 100644 --- a/Documentation/networking/packet_mmap.rst +++ b/Documentation/networking/packet_mmap.rst @@ -1030,7 +1030,7 @@ the packet meta information for mmap(2)ed RX_RING and TX_RINGs. If your NIC is capable of timestamping packets in hardware, you can request those hardware timestamps to be used. Note: you may need to enable the generation of hardware timestamps with SIOCSHWTSTAMP (see related information from -Documentation/networking/timestamping.txt). +Documentation/networking/timestamping.rst). PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING:: @@ -1069,7 +1069,7 @@ TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec members do not contain a valid value. For TX_RINGs, by default no timestamp is generated! -See include/linux/net_tstamp.h and Documentation/networking/timestamping.txt +See include/linux/net_tstamp.h and Documentation/networking/timestamping.rst for more information on hardware timestamps. Miscellaneous bits diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst new file mode 100644 index 000000000000..1adead6a4527 --- /dev/null +++ b/Documentation/networking/timestamping.rst @@ -0,0 +1,591 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Timestamping +============ + + +1. Control Interfaces +===================== + +The interfaces for receiving network packages timestamps are: + +SO_TIMESTAMP + Generates a timestamp for each incoming packet in (not necessarily + monotonic) system time. Reports the timestamp via recvmsg() in a + control message in usec resolution. + SO_TIMESTAMP is defined as SO_TIMESTAMP_NEW or SO_TIMESTAMP_OLD + based on the architecture type and time_t representation of libc. + Control message format is in struct __kernel_old_timeval for + SO_TIMESTAMP_OLD and in struct __kernel_sock_timeval for + SO_TIMESTAMP_NEW options respectively. + +SO_TIMESTAMPNS + Same timestamping mechanism as SO_TIMESTAMP, but reports the + timestamp as struct timespec in nsec resolution. + SO_TIMESTAMPNS is defined as SO_TIMESTAMPNS_NEW or SO_TIMESTAMPNS_OLD + based on the architecture type and time_t representation of libc. + Control message format is in struct timespec for SO_TIMESTAMPNS_OLD + and in struct __kernel_timespec for SO_TIMESTAMPNS_NEW options + respectively. + +IP_MULTICAST_LOOP + SO_TIMESTAMP[NS] + Only for multicast:approximate transmit timestamp obtained by + reading the looped packet receive timestamp. + +SO_TIMESTAMPING + Generates timestamps on reception, transmission or both. Supports + multiple timestamp sources, including hardware. Supports generating + timestamps for stream sockets. + + +1.1 SO_TIMESTAMP (also SO_TIMESTAMP_OLD and SO_TIMESTAMP_NEW) +------------------------------------------------------------- + +This socket option enables timestamping of datagrams on the reception +path. Because the destination socket, if any, is not known early in +the network stack, the feature has to be enabled for all packets. The +same is true for all early receive timestamp options. + +For interface details, see `man 7 socket`. + +Always use SO_TIMESTAMP_NEW timestamp to always get timestamp in +struct __kernel_sock_timeval format. + +SO_TIMESTAMP_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. + +1.2 SO_TIMESTAMPNS (also SO_TIMESTAMPNS_OLD and SO_TIMESTAMPNS_NEW): + +This option is identical to SO_TIMESTAMP except for the returned data type. +Its struct timespec allows for higher resolution (ns) timestamps than the +timeval of SO_TIMESTAMP (ms). + +Always use SO_TIMESTAMPNS_NEW timestamp to always get timestamp in +struct __kernel_timespec format. + +SO_TIMESTAMPNS_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. + +1.3 SO_TIMESTAMPING (also SO_TIMESTAMPING_OLD and SO_TIMESTAMPING_NEW) +---------------------------------------------------------------------- + +Supports multiple types of timestamp requests. As a result, this +socket option takes a bitmap of flags, not a boolean. In:: + + err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); + +val is an integer with any of the following bits set. Setting other +bit returns EINVAL and does not change the current state. + +The socket option configures timestamp generation for individual +sk_buffs (1.3.1), timestamp reporting to the socket's error +queue (1.3.2) and options (1.3.3). Timestamp generation can also +be enabled for individual sendmsg calls using cmsg (1.3.4). + + +1.3.1 Timestamp Generation +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some bits are requests to the stack to try to generate timestamps. Any +combination of them is valid. Changes to these bits apply to newly +created packets, not to packets already in the stack. As a result, it +is possible to selectively request timestamps for a subset of packets +(e.g., for sampling) by embedding an send() call within two setsockopt +calls, one to enable timestamp generation and one to disable it. +Timestamps may also be generated for reasons other than being +requested by a particular socket, such as when receive timestamping is +enabled system wide, as explained earlier. + +SOF_TIMESTAMPING_RX_HARDWARE: + Request rx timestamps generated by the network adapter. + +SOF_TIMESTAMPING_RX_SOFTWARE: + Request rx timestamps when data enters the kernel. These timestamps + are generated just after a device driver hands a packet to the + kernel receive stack. + +SOF_TIMESTAMPING_TX_HARDWARE: + Request tx timestamps generated by the network adapter. This flag + can be enabled via both socket options and control messages. + +SOF_TIMESTAMPING_TX_SOFTWARE: + Request tx timestamps when data leaves the kernel. These timestamps + are generated in the device driver as close as possible, but always + prior to, passing the packet to the network interface. Hence, they + require driver support and may not be available for all devices. + This flag can be enabled via both socket options and control messages. + +SOF_TIMESTAMPING_TX_SCHED: + Request tx timestamps prior to entering the packet scheduler. Kernel + transmit latency is, if long, often dominated by queuing delay. The + difference between this timestamp and one taken at + SOF_TIMESTAMPING_TX_SOFTWARE will expose this latency independent + of protocol processing. The latency incurred in protocol + processing, if any, can be computed by subtracting a userspace + timestamp taken immediately before send() from this timestamp. On + machines with virtual devices where a transmitted packet travels + through multiple devices and, hence, multiple packet schedulers, + a timestamp is generated at each layer. This allows for fine + grained measurement of queuing delay. This flag can be enabled + via both socket options and control messages. + +SOF_TIMESTAMPING_TX_ACK: + Request tx timestamps when all data in the send buffer has been + acknowledged. This only makes sense for reliable protocols. It is + currently only implemented for TCP. For that protocol, it may + over-report measurement, because the timestamp is generated when all + data up to and including the buffer at send() was acknowledged: the + cumulative acknowledgment. The mechanism ignores SACK and FACK. + This flag can be enabled via both socket options and control messages. + + +1.3.2 Timestamp Reporting +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The other three bits control which timestamps will be reported in a +generated control message. Changes to the bits take immediate +effect at the timestamp reporting locations in the stack. Timestamps +are only reported for packets that also have the relevant timestamp +generation request set. + +SOF_TIMESTAMPING_SOFTWARE: + Report any software timestamps when available. + +SOF_TIMESTAMPING_SYS_HARDWARE: + This option is deprecated and ignored. + +SOF_TIMESTAMPING_RAW_HARDWARE: + Report hardware timestamps as generated by + SOF_TIMESTAMPING_TX_HARDWARE when available. + + +1.3.3 Timestamp Options +^^^^^^^^^^^^^^^^^^^^^^^ + +The interface supports the options + +SOF_TIMESTAMPING_OPT_ID: + Generate a unique identifier along with each packet. A process can + have multiple concurrent timestamping requests outstanding. Packets + can be reordered in the transmit path, for instance in the packet + scheduler. In that case timestamps will be queued onto the error + queue out of order from the original send() calls. It is not always + possible to uniquely match timestamps to the original send() calls + based on timestamp order or payload inspection alone, then. + + This option associates each packet at send() with a unique + identifier and returns that along with the timestamp. The identifier + is derived from a per-socket u32 counter (that wraps). For datagram + sockets, the counter increments with each sent packet. For stream + sockets, it increments with every byte. + + The counter starts at zero. It is initialized the first time that + the socket option is enabled. It is reset each time the option is + enabled after having been disabled. Resetting the counter does not + change the identifiers of existing packets in the system. + + This option is implemented only for transmit timestamps. There, the + timestamp is always looped along with a struct sock_extended_err. + The option modifies field ee_data to pass an id that is unique + among all possibly concurrently outstanding timestamp requests for + that socket. + + +SOF_TIMESTAMPING_OPT_CMSG: + Support recv() cmsg for all timestamped packets. Control messages + are already supported unconditionally on all packets with receive + timestamps and on IPv6 packets with transmit timestamp. This option + extends them to IPv4 packets with transmit timestamp. One use case + is to correlate packets with their egress device, by enabling socket + option IP_PKTINFO simultaneously. + + +SOF_TIMESTAMPING_OPT_TSONLY: + Applies to transmit timestamps only. Makes the kernel return the + timestamp as a cmsg alongside an empty packet, as opposed to + alongside the original packet. This reduces the amount of memory + charged to the socket's receive budget (SO_RCVBUF) and delivers + the timestamp even if sysctl net.core.tstamp_allow_data is 0. + This option disables SOF_TIMESTAMPING_OPT_CMSG. + +SOF_TIMESTAMPING_OPT_STATS: + Optional stats that are obtained along with the transmit timestamps. + It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the + transmit timestamp is available, the stats are available in a + separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a + list of TLVs (struct nlattr) of types. These stats allow the + application to associate various transport layer stats with + the transmit timestamps, such as how long a certain block of + data was limited by peer's receiver window. + +SOF_TIMESTAMPING_OPT_PKTINFO: + Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming + packets with hardware timestamps. The message contains struct + scm_ts_pktinfo, which supplies the index of the real interface which + received the packet and its length at layer 2. A valid (non-zero) + interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is + enabled and the driver is using NAPI. The struct contains also two + other fields, but they are reserved and undefined. + +SOF_TIMESTAMPING_OPT_TX_SWHW: + Request both hardware and software timestamps for outgoing packets + when SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE + are enabled at the same time. If both timestamps are generated, + two separate messages will be looped to the socket's error queue, + each containing just one timestamp. + +New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to +disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate +regardless of the setting of sysctl net.core.tstamp_allow_data. + +An exception is when a process needs additional cmsg data, for +instance SOL_IP/IP_PKTINFO to detect the egress network interface. +Then pass option SOF_TIMESTAMPING_OPT_CMSG. This option depends on +having access to the contents of the original packet, so cannot be +combined with SOF_TIMESTAMPING_OPT_TSONLY. + + +1.3.4. Enabling timestamps via control messages +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In addition to socket options, timestamp generation can be requested +per write via cmsg, only for SOF_TIMESTAMPING_TX_* (see Section 1.3.1). +Using this feature, applications can sample timestamps per sendmsg() +without paying the overhead of enabling and disabling timestamps via +setsockopt:: + + struct msghdr *msg; + ... + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SO_TIMESTAMPING; + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); + *((__u32 *) CMSG_DATA(cmsg)) = SOF_TIMESTAMPING_TX_SCHED | + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK; + err = sendmsg(fd, msg, 0); + +The SOF_TIMESTAMPING_TX_* flags set via cmsg will override +the SOF_TIMESTAMPING_TX_* flags set via setsockopt. + +Moreover, applications must still enable timestamp reporting via +setsockopt to receive timestamps:: + + __u32 val = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID /* or any other flag */; + err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); + + +1.4 Bytestream Timestamps +------------------------- + +The SO_TIMESTAMPING interface supports timestamping of bytes in a +bytestream. Each request is interpreted as a request for when the +entire contents of the buffer has passed a timestamping point. That +is, for streams option SOF_TIMESTAMPING_TX_SOFTWARE will record +when all bytes have reached the device driver, regardless of how +many packets the data has been converted into. + +In general, bytestreams have no natural delimiters and therefore +correlating a timestamp with data is non-trivial. A range of bytes +may be split across segments, any segments may be merged (possibly +coalescing sections of previously segmented buffers associated with +independent send() calls). Segments can be reordered and the same +byte range can coexist in multiple segments for protocols that +implement retransmissions. + +It is essential that all timestamps implement the same semantics, +regardless of these possible transformations, as otherwise they are +incomparable. Handling "rare" corner cases differently from the +simple case (a 1:1 mapping from buffer to skb) is insufficient +because performance debugging often needs to focus on such outliers. + +In practice, timestamps can be correlated with segments of a +bytestream consistently, if both semantics of the timestamp and the +timing of measurement are chosen correctly. This challenge is no +different from deciding on a strategy for IP fragmentation. There, the +definition is that only the first fragment is timestamped. For +bytestreams, we chose that a timestamp is generated only when all +bytes have passed a point. SOF_TIMESTAMPING_TX_ACK as defined is easy to +implement and reason about. An implementation that has to take into +account SACK would be more complex due to possible transmission holes +and out of order arrival. + +On the host, TCP can also break the simple 1:1 mapping from buffer to +skbuff as a result of Nagle, cork, autocork, segmentation and GSO. The +implementation ensures correctness in all cases by tracking the +individual last byte passed to send(), even if it is no longer the +last byte after an skbuff extend or merge operation. It stores the +relevant sequence number in skb_shinfo(skb)->tskey. Because an skbuff +has only one such field, only one timestamp can be generated. + +In rare cases, a timestamp request can be missed if two requests are +collapsed onto the same skb. A process can detect this situation by +enabling SOF_TIMESTAMPING_OPT_ID and comparing the byte offset at +send time with the value returned for each timestamp. It can prevent +the situation by always flushing the TCP stack in between requests, +for instance by enabling TCP_NODELAY and disabling TCP_CORK and +autocork. + +These precautions ensure that the timestamp is generated only when all +bytes have passed a timestamp point, assuming that the network stack +itself does not reorder the segments. The stack indeed tries to avoid +reordering. The one exception is under administrator control: it is +possible to construct a packet scheduler configuration that delays +segments from the same stream differently. Such a setup would be +unusual. + + +2 Data Interfaces +================== + +Timestamps are read using the ancillary data feature of recvmsg(). +See `man 3 cmsg` for details of this interface. The socket manual +page (`man 7 socket`) describes how timestamps generated with +SO_TIMESTAMP and SO_TIMESTAMPNS records can be retrieved. + + +2.1 SCM_TIMESTAMPING records +---------------------------- + +These timestamps are returned in a control message with cmsg_level +SOL_SOCKET, cmsg_type SCM_TIMESTAMPING, and payload of type + +For SO_TIMESTAMPING_OLD:: + + struct scm_timestamping { + struct timespec ts[3]; + }; + +For SO_TIMESTAMPING_NEW:: + + struct scm_timestamping64 { + struct __kernel_timespec ts[3]; + +Always use SO_TIMESTAMPING_NEW timestamp to always get timestamp in +struct scm_timestamping64 format. + +SO_TIMESTAMPING_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. + +The structure can return up to three timestamps. This is a legacy +feature. At least one field is non-zero at any time. Most timestamps +are passed in ts[0]. Hardware timestamps are passed in ts[2]. + +ts[1] used to hold hardware timestamps converted to system time. +Instead, expose the hardware clock device on the NIC directly as +a HW PTP clock source, to allow time conversion in userspace and +optionally synchronize system time with a userspace PTP stack such +as linuxptp. For the PTP clock API, see Documentation/driver-api/ptp.rst. + +Note that if the SO_TIMESTAMP or SO_TIMESTAMPNS option is enabled +together with SO_TIMESTAMPING using SOF_TIMESTAMPING_SOFTWARE, a false +software timestamp will be generated in the recvmsg() call and passed +in ts[0] when a real software timestamp is missing. This happens also +on hardware transmit timestamps. + +2.1.1 Transmit timestamps with MSG_ERRQUEUE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For transmit timestamps the outgoing packet is looped back to the +socket's error queue with the send timestamp(s) attached. A process +receives the timestamps by calling recvmsg() with flag MSG_ERRQUEUE +set and with a msg_control buffer sufficiently large to receive the +relevant metadata structures. The recvmsg call returns the original +outgoing data packet with two ancillary messages attached. + +A message of cm_level SOL_IP(V6) and cm_type IP(V6)_RECVERR +embeds a struct sock_extended_err. This defines the error type. For +timestamps, the ee_errno field is ENOMSG. The other ancillary message +will have cm_level SOL_SOCKET and cm_type SCM_TIMESTAMPING. This +embeds the struct scm_timestamping. + + +2.1.1.2 Timestamp types +~~~~~~~~~~~~~~~~~~~~~~~ + +The semantics of the three struct timespec are defined by field +ee_info in the extended error structure. It contains a value of +type SCM_TSTAMP_* to define the actual timestamp passed in +scm_timestamping. + +The SCM_TSTAMP_* types are 1:1 matches to the SOF_TIMESTAMPING_* +control fields discussed previously, with one exception. For legacy +reasons, SCM_TSTAMP_SND is equal to zero and can be set for both +SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE. It +is the first if ts[2] is non-zero, the second otherwise, in which +case the timestamp is stored in ts[0]. + + +2.1.1.3 Fragmentation +~~~~~~~~~~~~~~~~~~~~~ + +Fragmentation of outgoing datagrams is rare, but is possible, e.g., by +explicitly disabling PMTU discovery. If an outgoing packet is fragmented, +then only the first fragment is timestamped and returned to the sending +socket. + + +2.1.1.4 Packet Payload +~~~~~~~~~~~~~~~~~~~~~~ + +The calling application is often not interested in receiving the whole +packet payload that it passed to the stack originally: the socket +error queue mechanism is just a method to piggyback the timestamp on. +In this case, the application can choose to read datagrams with a +smaller buffer, possibly even of length 0. The payload is truncated +accordingly. Until the process calls recvmsg() on the error queue, +however, the full packet is queued, taking up budget from SO_RCVBUF. + + +2.1.1.5 Blocking Read +~~~~~~~~~~~~~~~~~~~~~ + +Reading from the error queue is always a non-blocking operation. To +block waiting on a timestamp, use poll or select. poll() will return +POLLERR in pollfd.revents if any data is ready on the error queue. +There is no need to pass this flag in pollfd.events. This flag is +ignored on request. See also `man 2 poll`. + + +2.1.2 Receive timestamps +^^^^^^^^^^^^^^^^^^^^^^^^ + +On reception, there is no reason to read from the socket error queue. +The SCM_TIMESTAMPING ancillary data is sent along with the packet data +on a normal recvmsg(). Since this is not a socket error, it is not +accompanied by a message SOL_IP(V6)/IP(V6)_RECVERROR. In this case, +the meaning of the three fields in struct scm_timestamping is +implicitly defined. ts[0] holds a software timestamp if set, ts[1] +is again deprecated and ts[2] holds a hardware timestamp if set. + + +3. Hardware Timestamping configuration: SIOCSHWTSTAMP and SIOCGHWTSTAMP +======================================================================= + +Hardware time stamping must also be initialized for each device driver +that is expected to do hardware time stamping. The parameter is defined in +include/uapi/linux/net_tstamp.h as:: + + struct hwtstamp_config { + int flags; /* no flags defined right now, must be zero */ + int tx_type; /* HWTSTAMP_TX_* */ + int rx_filter; /* HWTSTAMP_FILTER_* */ + }; + +Desired behavior is passed into the kernel and to a specific device by +calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose +ifr_data points to a struct hwtstamp_config. The tx_type and +rx_filter are hints to the driver what it is expected to do. If +the requested fine-grained filtering for incoming packets is not +supported, the driver may time stamp more than just the requested types +of packets. + +Drivers are free to use a more permissive configuration than the requested +configuration. It is expected that drivers should only implement directly the +most generic mode that can be supported. For example if the hardware can +support HWTSTAMP_FILTER_V2_EVENT, then it should generally always upscale +HWTSTAMP_FILTER_V2_L2_SYNC_MESSAGE, and so forth, as HWTSTAMP_FILTER_V2_EVENT +is more generic (and more useful to applications). + +A driver which supports hardware time stamping shall update the struct +with the actual, possibly more permissive configuration. If the +requested packets cannot be time stamped, then nothing should be +changed and ERANGE shall be returned (in contrast to EINVAL, which +indicates that SIOCSHWTSTAMP is not supported at all). + +Only a processes with admin rights may change the configuration. User +space is responsible to ensure that multiple processes don't interfere +with each other and that the settings are reset. + +Any process can read the actual configuration by passing this +structure to ioctl(SIOCGHWTSTAMP) in the same way. However, this has +not been implemented in all drivers. + +:: + + /* possible values for hwtstamp_config->tx_type */ + enum { + /* + * no outgoing packet will need hardware time stamping; + * should a packet arrive which asks for it, no hardware + * time stamping will be done + */ + HWTSTAMP_TX_OFF, + + /* + * enables hardware time stamping for outgoing packets; + * the sender of the packet decides which are to be + * time stamped by setting SOF_TIMESTAMPING_TX_SOFTWARE + * before sending the packet + */ + HWTSTAMP_TX_ON, + }; + + /* possible values for hwtstamp_config->rx_filter */ + enum { + /* time stamp no incoming packet at all */ + HWTSTAMP_FILTER_NONE, + + /* time stamp any incoming packet */ + HWTSTAMP_FILTER_ALL, + + /* return value: time stamp all packets requested plus some others */ + HWTSTAMP_FILTER_SOME, + + /* PTP v1, UDP, any kind of event packet */ + HWTSTAMP_FILTER_PTP_V1_L4_EVENT, + + /* for the complete list of values, please check + * the include file include/uapi/linux/net_tstamp.h + */ + }; + +3.1 Hardware Timestamping Implementation: Device Drivers +-------------------------------------------------------- + +A driver which supports hardware time stamping must support the +SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with +the actual values as described in the section on SIOCSHWTSTAMP. It +should also support SIOCGHWTSTAMP. + +Time stamps for received packets must be stored in the skb. To get a pointer +to the shared time stamp structure of the skb call skb_hwtstamps(). Then +set the time stamps in the structure:: + + struct skb_shared_hwtstamps { + /* hardware time stamp transformed into duration + * since arbitrary point in time + */ + ktime_t hwtstamp; + }; + +Time stamps for outgoing packets are to be generated as follows: + +- In hard_start_xmit(), check if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) + is set no-zero. If yes, then the driver is expected to do hardware time + stamping. +- If this is possible for the skb and requested, then declare + that the driver is doing the time stamping by setting the flag + SKBTX_IN_PROGRESS in skb_shinfo(skb)->tx_flags , e.g. with:: + + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + + You might want to keep a pointer to the associated skb for the next step + and not free the skb. A driver not supporting hardware time stamping doesn't + do that. A driver must never touch sk_buff::tstamp! It is used to store + software generated time stamps by the network subsystem. +- Driver should call skb_tx_timestamp() as close to passing sk_buff to hardware + as possible. skb_tx_timestamp() provides a software time stamp if requested + and hardware timestamping is not possible (SKBTX_IN_PROGRESS not set). +- As soon as the driver has sent the packet and/or obtained a + hardware time stamp for it, it passes the time stamp back by + calling skb_hwtstamp_tx() with the original skb, the raw + hardware time stamp. skb_hwtstamp_tx() clones the original skb and + adds the timestamps, therefore the original skb has to be freed now. + If obtaining the hardware time stamp somehow fails, then the driver + should not fall back to software time stamping. The rationale is that + this would occur at a later time in the processing pipeline than other + software time stamping and therefore could lead to unexpected deltas + between time stamps. diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt deleted file mode 100644 index 8dd6333c3270..000000000000 --- a/Documentation/networking/timestamping.txt +++ /dev/null @@ -1,571 +0,0 @@ - -1. Control Interfaces - -The interfaces for receiving network packages timestamps are: - -* SO_TIMESTAMP - Generates a timestamp for each incoming packet in (not necessarily - monotonic) system time. Reports the timestamp via recvmsg() in a - control message in usec resolution. - SO_TIMESTAMP is defined as SO_TIMESTAMP_NEW or SO_TIMESTAMP_OLD - based on the architecture type and time_t representation of libc. - Control message format is in struct __kernel_old_timeval for - SO_TIMESTAMP_OLD and in struct __kernel_sock_timeval for - SO_TIMESTAMP_NEW options respectively. - -* SO_TIMESTAMPNS - Same timestamping mechanism as SO_TIMESTAMP, but reports the - timestamp as struct timespec in nsec resolution. - SO_TIMESTAMPNS is defined as SO_TIMESTAMPNS_NEW or SO_TIMESTAMPNS_OLD - based on the architecture type and time_t representation of libc. - Control message format is in struct timespec for SO_TIMESTAMPNS_OLD - and in struct __kernel_timespec for SO_TIMESTAMPNS_NEW options - respectively. - -* IP_MULTICAST_LOOP + SO_TIMESTAMP[NS] - Only for multicast:approximate transmit timestamp obtained by - reading the looped packet receive timestamp. - -* SO_TIMESTAMPING - Generates timestamps on reception, transmission or both. Supports - multiple timestamp sources, including hardware. Supports generating - timestamps for stream sockets. - - -1.1 SO_TIMESTAMP (also SO_TIMESTAMP_OLD and SO_TIMESTAMP_NEW): - -This socket option enables timestamping of datagrams on the reception -path. Because the destination socket, if any, is not known early in -the network stack, the feature has to be enabled for all packets. The -same is true for all early receive timestamp options. - -For interface details, see `man 7 socket`. - -Always use SO_TIMESTAMP_NEW timestamp to always get timestamp in -struct __kernel_sock_timeval format. - -SO_TIMESTAMP_OLD returns incorrect timestamps after the year 2038 -on 32 bit machines. - -1.2 SO_TIMESTAMPNS (also SO_TIMESTAMPNS_OLD and SO_TIMESTAMPNS_NEW): - -This option is identical to SO_TIMESTAMP except for the returned data type. -Its struct timespec allows for higher resolution (ns) timestamps than the -timeval of SO_TIMESTAMP (ms). - -Always use SO_TIMESTAMPNS_NEW timestamp to always get timestamp in -struct __kernel_timespec format. - -SO_TIMESTAMPNS_OLD returns incorrect timestamps after the year 2038 -on 32 bit machines. - -1.3 SO_TIMESTAMPING (also SO_TIMESTAMPING_OLD and SO_TIMESTAMPING_NEW): - -Supports multiple types of timestamp requests. As a result, this -socket option takes a bitmap of flags, not a boolean. In - - err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); - -val is an integer with any of the following bits set. Setting other -bit returns EINVAL and does not change the current state. - -The socket option configures timestamp generation for individual -sk_buffs (1.3.1), timestamp reporting to the socket's error -queue (1.3.2) and options (1.3.3). Timestamp generation can also -be enabled for individual sendmsg calls using cmsg (1.3.4). - - -1.3.1 Timestamp Generation - -Some bits are requests to the stack to try to generate timestamps. Any -combination of them is valid. Changes to these bits apply to newly -created packets, not to packets already in the stack. As a result, it -is possible to selectively request timestamps for a subset of packets -(e.g., for sampling) by embedding an send() call within two setsockopt -calls, one to enable timestamp generation and one to disable it. -Timestamps may also be generated for reasons other than being -requested by a particular socket, such as when receive timestamping is -enabled system wide, as explained earlier. - -SOF_TIMESTAMPING_RX_HARDWARE: - Request rx timestamps generated by the network adapter. - -SOF_TIMESTAMPING_RX_SOFTWARE: - Request rx timestamps when data enters the kernel. These timestamps - are generated just after a device driver hands a packet to the - kernel receive stack. - -SOF_TIMESTAMPING_TX_HARDWARE: - Request tx timestamps generated by the network adapter. This flag - can be enabled via both socket options and control messages. - -SOF_TIMESTAMPING_TX_SOFTWARE: - Request tx timestamps when data leaves the kernel. These timestamps - are generated in the device driver as close as possible, but always - prior to, passing the packet to the network interface. Hence, they - require driver support and may not be available for all devices. - This flag can be enabled via both socket options and control messages. - - -SOF_TIMESTAMPING_TX_SCHED: - Request tx timestamps prior to entering the packet scheduler. Kernel - transmit latency is, if long, often dominated by queuing delay. The - difference between this timestamp and one taken at - SOF_TIMESTAMPING_TX_SOFTWARE will expose this latency independent - of protocol processing. The latency incurred in protocol - processing, if any, can be computed by subtracting a userspace - timestamp taken immediately before send() from this timestamp. On - machines with virtual devices where a transmitted packet travels - through multiple devices and, hence, multiple packet schedulers, - a timestamp is generated at each layer. This allows for fine - grained measurement of queuing delay. This flag can be enabled - via both socket options and control messages. - -SOF_TIMESTAMPING_TX_ACK: - Request tx timestamps when all data in the send buffer has been - acknowledged. This only makes sense for reliable protocols. It is - currently only implemented for TCP. For that protocol, it may - over-report measurement, because the timestamp is generated when all - data up to and including the buffer at send() was acknowledged: the - cumulative acknowledgment. The mechanism ignores SACK and FACK. - This flag can be enabled via both socket options and control messages. - - -1.3.2 Timestamp Reporting - -The other three bits control which timestamps will be reported in a -generated control message. Changes to the bits take immediate -effect at the timestamp reporting locations in the stack. Timestamps -are only reported for packets that also have the relevant timestamp -generation request set. - -SOF_TIMESTAMPING_SOFTWARE: - Report any software timestamps when available. - -SOF_TIMESTAMPING_SYS_HARDWARE: - This option is deprecated and ignored. - -SOF_TIMESTAMPING_RAW_HARDWARE: - Report hardware timestamps as generated by - SOF_TIMESTAMPING_TX_HARDWARE when available. - - -1.3.3 Timestamp Options - -The interface supports the options - -SOF_TIMESTAMPING_OPT_ID: - - Generate a unique identifier along with each packet. A process can - have multiple concurrent timestamping requests outstanding. Packets - can be reordered in the transmit path, for instance in the packet - scheduler. In that case timestamps will be queued onto the error - queue out of order from the original send() calls. It is not always - possible to uniquely match timestamps to the original send() calls - based on timestamp order or payload inspection alone, then. - - This option associates each packet at send() with a unique - identifier and returns that along with the timestamp. The identifier - is derived from a per-socket u32 counter (that wraps). For datagram - sockets, the counter increments with each sent packet. For stream - sockets, it increments with every byte. - - The counter starts at zero. It is initialized the first time that - the socket option is enabled. It is reset each time the option is - enabled after having been disabled. Resetting the counter does not - change the identifiers of existing packets in the system. - - This option is implemented only for transmit timestamps. There, the - timestamp is always looped along with a struct sock_extended_err. - The option modifies field ee_data to pass an id that is unique - among all possibly concurrently outstanding timestamp requests for - that socket. - - -SOF_TIMESTAMPING_OPT_CMSG: - - Support recv() cmsg for all timestamped packets. Control messages - are already supported unconditionally on all packets with receive - timestamps and on IPv6 packets with transmit timestamp. This option - extends them to IPv4 packets with transmit timestamp. One use case - is to correlate packets with their egress device, by enabling socket - option IP_PKTINFO simultaneously. - - -SOF_TIMESTAMPING_OPT_TSONLY: - - Applies to transmit timestamps only. Makes the kernel return the - timestamp as a cmsg alongside an empty packet, as opposed to - alongside the original packet. This reduces the amount of memory - charged to the socket's receive budget (SO_RCVBUF) and delivers - the timestamp even if sysctl net.core.tstamp_allow_data is 0. - This option disables SOF_TIMESTAMPING_OPT_CMSG. - -SOF_TIMESTAMPING_OPT_STATS: - - Optional stats that are obtained along with the transmit timestamps. - It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the - transmit timestamp is available, the stats are available in a - separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a - list of TLVs (struct nlattr) of types. These stats allow the - application to associate various transport layer stats with - the transmit timestamps, such as how long a certain block of - data was limited by peer's receiver window. - -SOF_TIMESTAMPING_OPT_PKTINFO: - - Enable the SCM_TIMESTAMPING_PKTINFO control message for incoming - packets with hardware timestamps. The message contains struct - scm_ts_pktinfo, which supplies the index of the real interface which - received the packet and its length at layer 2. A valid (non-zero) - interface index will be returned only if CONFIG_NET_RX_BUSY_POLL is - enabled and the driver is using NAPI. The struct contains also two - other fields, but they are reserved and undefined. - -SOF_TIMESTAMPING_OPT_TX_SWHW: - - Request both hardware and software timestamps for outgoing packets - when SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE - are enabled at the same time. If both timestamps are generated, - two separate messages will be looped to the socket's error queue, - each containing just one timestamp. - -New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to -disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate -regardless of the setting of sysctl net.core.tstamp_allow_data. - -An exception is when a process needs additional cmsg data, for -instance SOL_IP/IP_PKTINFO to detect the egress network interface. -Then pass option SOF_TIMESTAMPING_OPT_CMSG. This option depends on -having access to the contents of the original packet, so cannot be -combined with SOF_TIMESTAMPING_OPT_TSONLY. - - -1.3.4. Enabling timestamps via control messages - -In addition to socket options, timestamp generation can be requested -per write via cmsg, only for SOF_TIMESTAMPING_TX_* (see Section 1.3.1). -Using this feature, applications can sample timestamps per sendmsg() -without paying the overhead of enabling and disabling timestamps via -setsockopt: - - struct msghdr *msg; - ... - cmsg = CMSG_FIRSTHDR(msg); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SO_TIMESTAMPING; - cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); - *((__u32 *) CMSG_DATA(cmsg)) = SOF_TIMESTAMPING_TX_SCHED | - SOF_TIMESTAMPING_TX_SOFTWARE | - SOF_TIMESTAMPING_TX_ACK; - err = sendmsg(fd, msg, 0); - -The SOF_TIMESTAMPING_TX_* flags set via cmsg will override -the SOF_TIMESTAMPING_TX_* flags set via setsockopt. - -Moreover, applications must still enable timestamp reporting via -setsockopt to receive timestamps: - - __u32 val = SOF_TIMESTAMPING_SOFTWARE | - SOF_TIMESTAMPING_OPT_ID /* or any other flag */; - err = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); - - -1.4 Bytestream Timestamps - -The SO_TIMESTAMPING interface supports timestamping of bytes in a -bytestream. Each request is interpreted as a request for when the -entire contents of the buffer has passed a timestamping point. That -is, for streams option SOF_TIMESTAMPING_TX_SOFTWARE will record -when all bytes have reached the device driver, regardless of how -many packets the data has been converted into. - -In general, bytestreams have no natural delimiters and therefore -correlating a timestamp with data is non-trivial. A range of bytes -may be split across segments, any segments may be merged (possibly -coalescing sections of previously segmented buffers associated with -independent send() calls). Segments can be reordered and the same -byte range can coexist in multiple segments for protocols that -implement retransmissions. - -It is essential that all timestamps implement the same semantics, -regardless of these possible transformations, as otherwise they are -incomparable. Handling "rare" corner cases differently from the -simple case (a 1:1 mapping from buffer to skb) is insufficient -because performance debugging often needs to focus on such outliers. - -In practice, timestamps can be correlated with segments of a -bytestream consistently, if both semantics of the timestamp and the -timing of measurement are chosen correctly. This challenge is no -different from deciding on a strategy for IP fragmentation. There, the -definition is that only the first fragment is timestamped. For -bytestreams, we chose that a timestamp is generated only when all -bytes have passed a point. SOF_TIMESTAMPING_TX_ACK as defined is easy to -implement and reason about. An implementation that has to take into -account SACK would be more complex due to possible transmission holes -and out of order arrival. - -On the host, TCP can also break the simple 1:1 mapping from buffer to -skbuff as a result of Nagle, cork, autocork, segmentation and GSO. The -implementation ensures correctness in all cases by tracking the -individual last byte passed to send(), even if it is no longer the -last byte after an skbuff extend or merge operation. It stores the -relevant sequence number in skb_shinfo(skb)->tskey. Because an skbuff -has only one such field, only one timestamp can be generated. - -In rare cases, a timestamp request can be missed if two requests are -collapsed onto the same skb. A process can detect this situation by -enabling SOF_TIMESTAMPING_OPT_ID and comparing the byte offset at -send time with the value returned for each timestamp. It can prevent -the situation by always flushing the TCP stack in between requests, -for instance by enabling TCP_NODELAY and disabling TCP_CORK and -autocork. - -These precautions ensure that the timestamp is generated only when all -bytes have passed a timestamp point, assuming that the network stack -itself does not reorder the segments. The stack indeed tries to avoid -reordering. The one exception is under administrator control: it is -possible to construct a packet scheduler configuration that delays -segments from the same stream differently. Such a setup would be -unusual. - - -2 Data Interfaces - -Timestamps are read using the ancillary data feature of recvmsg(). -See `man 3 cmsg` for details of this interface. The socket manual -page (`man 7 socket`) describes how timestamps generated with -SO_TIMESTAMP and SO_TIMESTAMPNS records can be retrieved. - - -2.1 SCM_TIMESTAMPING records - -These timestamps are returned in a control message with cmsg_level -SOL_SOCKET, cmsg_type SCM_TIMESTAMPING, and payload of type - -For SO_TIMESTAMPING_OLD: - -struct scm_timestamping { - struct timespec ts[3]; -}; - -For SO_TIMESTAMPING_NEW: - -struct scm_timestamping64 { - struct __kernel_timespec ts[3]; - -Always use SO_TIMESTAMPING_NEW timestamp to always get timestamp in -struct scm_timestamping64 format. - -SO_TIMESTAMPING_OLD returns incorrect timestamps after the year 2038 -on 32 bit machines. - -The structure can return up to three timestamps. This is a legacy -feature. At least one field is non-zero at any time. Most timestamps -are passed in ts[0]. Hardware timestamps are passed in ts[2]. - -ts[1] used to hold hardware timestamps converted to system time. -Instead, expose the hardware clock device on the NIC directly as -a HW PTP clock source, to allow time conversion in userspace and -optionally synchronize system time with a userspace PTP stack such -as linuxptp. For the PTP clock API, see Documentation/driver-api/ptp.rst. - -Note that if the SO_TIMESTAMP or SO_TIMESTAMPNS option is enabled -together with SO_TIMESTAMPING using SOF_TIMESTAMPING_SOFTWARE, a false -software timestamp will be generated in the recvmsg() call and passed -in ts[0] when a real software timestamp is missing. This happens also -on hardware transmit timestamps. - -2.1.1 Transmit timestamps with MSG_ERRQUEUE - -For transmit timestamps the outgoing packet is looped back to the -socket's error queue with the send timestamp(s) attached. A process -receives the timestamps by calling recvmsg() with flag MSG_ERRQUEUE -set and with a msg_control buffer sufficiently large to receive the -relevant metadata structures. The recvmsg call returns the original -outgoing data packet with two ancillary messages attached. - -A message of cm_level SOL_IP(V6) and cm_type IP(V6)_RECVERR -embeds a struct sock_extended_err. This defines the error type. For -timestamps, the ee_errno field is ENOMSG. The other ancillary message -will have cm_level SOL_SOCKET and cm_type SCM_TIMESTAMPING. This -embeds the struct scm_timestamping. - - -2.1.1.2 Timestamp types - -The semantics of the three struct timespec are defined by field -ee_info in the extended error structure. It contains a value of -type SCM_TSTAMP_* to define the actual timestamp passed in -scm_timestamping. - -The SCM_TSTAMP_* types are 1:1 matches to the SOF_TIMESTAMPING_* -control fields discussed previously, with one exception. For legacy -reasons, SCM_TSTAMP_SND is equal to zero and can be set for both -SOF_TIMESTAMPING_TX_HARDWARE and SOF_TIMESTAMPING_TX_SOFTWARE. It -is the first if ts[2] is non-zero, the second otherwise, in which -case the timestamp is stored in ts[0]. - - -2.1.1.3 Fragmentation - -Fragmentation of outgoing datagrams is rare, but is possible, e.g., by -explicitly disabling PMTU discovery. If an outgoing packet is fragmented, -then only the first fragment is timestamped and returned to the sending -socket. - - -2.1.1.4 Packet Payload - -The calling application is often not interested in receiving the whole -packet payload that it passed to the stack originally: the socket -error queue mechanism is just a method to piggyback the timestamp on. -In this case, the application can choose to read datagrams with a -smaller buffer, possibly even of length 0. The payload is truncated -accordingly. Until the process calls recvmsg() on the error queue, -however, the full packet is queued, taking up budget from SO_RCVBUF. - - -2.1.1.5 Blocking Read - -Reading from the error queue is always a non-blocking operation. To -block waiting on a timestamp, use poll or select. poll() will return -POLLERR in pollfd.revents if any data is ready on the error queue. -There is no need to pass this flag in pollfd.events. This flag is -ignored on request. See also `man 2 poll`. - - -2.1.2 Receive timestamps - -On reception, there is no reason to read from the socket error queue. -The SCM_TIMESTAMPING ancillary data is sent along with the packet data -on a normal recvmsg(). Since this is not a socket error, it is not -accompanied by a message SOL_IP(V6)/IP(V6)_RECVERROR. In this case, -the meaning of the three fields in struct scm_timestamping is -implicitly defined. ts[0] holds a software timestamp if set, ts[1] -is again deprecated and ts[2] holds a hardware timestamp if set. - - -3. Hardware Timestamping configuration: SIOCSHWTSTAMP and SIOCGHWTSTAMP - -Hardware time stamping must also be initialized for each device driver -that is expected to do hardware time stamping. The parameter is defined in -include/uapi/linux/net_tstamp.h as: - -struct hwtstamp_config { - int flags; /* no flags defined right now, must be zero */ - int tx_type; /* HWTSTAMP_TX_* */ - int rx_filter; /* HWTSTAMP_FILTER_* */ -}; - -Desired behavior is passed into the kernel and to a specific device by -calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose -ifr_data points to a struct hwtstamp_config. The tx_type and -rx_filter are hints to the driver what it is expected to do. If -the requested fine-grained filtering for incoming packets is not -supported, the driver may time stamp more than just the requested types -of packets. - -Drivers are free to use a more permissive configuration than the requested -configuration. It is expected that drivers should only implement directly the -most generic mode that can be supported. For example if the hardware can -support HWTSTAMP_FILTER_V2_EVENT, then it should generally always upscale -HWTSTAMP_FILTER_V2_L2_SYNC_MESSAGE, and so forth, as HWTSTAMP_FILTER_V2_EVENT -is more generic (and more useful to applications). - -A driver which supports hardware time stamping shall update the struct -with the actual, possibly more permissive configuration. If the -requested packets cannot be time stamped, then nothing should be -changed and ERANGE shall be returned (in contrast to EINVAL, which -indicates that SIOCSHWTSTAMP is not supported at all). - -Only a processes with admin rights may change the configuration. User -space is responsible to ensure that multiple processes don't interfere -with each other and that the settings are reset. - -Any process can read the actual configuration by passing this -structure to ioctl(SIOCGHWTSTAMP) in the same way. However, this has -not been implemented in all drivers. - -/* possible values for hwtstamp_config->tx_type */ -enum { - /* - * no outgoing packet will need hardware time stamping; - * should a packet arrive which asks for it, no hardware - * time stamping will be done - */ - HWTSTAMP_TX_OFF, - - /* - * enables hardware time stamping for outgoing packets; - * the sender of the packet decides which are to be - * time stamped by setting SOF_TIMESTAMPING_TX_SOFTWARE - * before sending the packet - */ - HWTSTAMP_TX_ON, -}; - -/* possible values for hwtstamp_config->rx_filter */ -enum { - /* time stamp no incoming packet at all */ - HWTSTAMP_FILTER_NONE, - - /* time stamp any incoming packet */ - HWTSTAMP_FILTER_ALL, - - /* return value: time stamp all packets requested plus some others */ - HWTSTAMP_FILTER_SOME, - - /* PTP v1, UDP, any kind of event packet */ - HWTSTAMP_FILTER_PTP_V1_L4_EVENT, - - /* for the complete list of values, please check - * the include file include/uapi/linux/net_tstamp.h - */ -}; - -3.1 Hardware Timestamping Implementation: Device Drivers - -A driver which supports hardware time stamping must support the -SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with -the actual values as described in the section on SIOCSHWTSTAMP. It -should also support SIOCGHWTSTAMP. - -Time stamps for received packets must be stored in the skb. To get a pointer -to the shared time stamp structure of the skb call skb_hwtstamps(). Then -set the time stamps in the structure: - -struct skb_shared_hwtstamps { - /* hardware time stamp transformed into duration - * since arbitrary point in time - */ - ktime_t hwtstamp; -}; - -Time stamps for outgoing packets are to be generated as follows: -- In hard_start_xmit(), check if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) - is set no-zero. If yes, then the driver is expected to do hardware time - stamping. -- If this is possible for the skb and requested, then declare - that the driver is doing the time stamping by setting the flag - SKBTX_IN_PROGRESS in skb_shinfo(skb)->tx_flags , e.g. with - - skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; - - You might want to keep a pointer to the associated skb for the next step - and not free the skb. A driver not supporting hardware time stamping doesn't - do that. A driver must never touch sk_buff::tstamp! It is used to store - software generated time stamps by the network subsystem. -- Driver should call skb_tx_timestamp() as close to passing sk_buff to hardware - as possible. skb_tx_timestamp() provides a software time stamp if requested - and hardware timestamping is not possible (SKBTX_IN_PROGRESS not set). -- As soon as the driver has sent the packet and/or obtained a - hardware time stamp for it, it passes the time stamp back by - calling skb_hwtstamp_tx() with the original skb, the raw - hardware time stamp. skb_hwtstamp_tx() clones the original skb and - adds the timestamps, therefore the original skb has to be freed now. - If obtaining the hardware time stamp somehow fails, then the driver - should not fall back to software time stamping. The rationale is that - this would occur at a later time in the processing pipeline than other - software time stamping and therefore could lead to unexpected deltas - between time stamps. diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 0cca19670fd2..ca5cb3e3c6df 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -36,7 +36,7 @@ struct sock_extended_err { * * The timestamping interfaces SO_TIMESTAMPING, MSG_TSTAMP_* * communicate network timestamps by passing this struct in a cmsg with - * recvmsg(). See Documentation/networking/timestamping.txt for details. + * recvmsg(). See Documentation/networking/timestamping.rst for details. * User space sees a timespec definition that matches either * __kernel_timespec or __kernel_old_timespec, in the kernel we * require two structure definitions to provide both. -- cgit v1.2.3 From d07dcf9aadd6b2842b439e8668ff7ea2873f28d7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Apr 2020 22:13:12 +0200 Subject: netlink: add infrastructure to expose policies to userspace Add, and use in generic netlink, helpers to dump out a netlink policy to userspace, including all the range validation data, nested policies etc. This lets userspace discover what the kernel understands. For families/commands other than generic netlink, the helpers need to be used directly in an appropriate command, or we can add some infrastructure (a new netlink family) that those can register their policies with for introspection. I'm not that familiar with non-generic netlink, so that's left out for now. The data exposed to userspace also includes min and max length for binary/string data, I've done that instead of letting the userspace tools figure out whether min/max is intended based on the type so that we can extend this later in the kernel, we might want to just use the range data for example. Because of this, I opted to not directly expose the NLA_* values, even if some of them are already exposed via BPF, as with min/max length we don't need to have different types here for NLA_BINARY/NLA_MIN_LEN/NLA_EXACT_LEN, we just make them all NL_ATTR_TYPE_BINARY with min/max length optionally set. Similarly, we don't really need NLA_MSECS, and perhaps can remove it in the future - but not if we encode it into the userspace API now. It gets mapped to NL_ATTR_TYPE_U64 here. Note that the exposing here corresponds to the strict policy interpretation, and NLA_UNSPEC items are omitted entirely. To get those, change them to NLA_MIN_LEN which behaves in exactly the same way, but is exposed. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 6 + include/uapi/linux/genetlink.h | 2 + include/uapi/linux/netlink.h | 103 ++++++++++++++ net/netlink/Makefile | 2 +- net/netlink/genetlink.c | 78 +++++++++++ net/netlink/policy.c | 308 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 net/netlink/policy.c (limited to 'include/uapi') diff --git a/include/net/netlink.h b/include/net/netlink.h index 557b67f1db99..c0411f14fb53 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1933,4 +1933,10 @@ void nla_get_range_unsigned(const struct nla_policy *pt, void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *state); +bool netlink_policy_dump_loop(unsigned long *state); +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long state); + #endif diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 877f7fa95466..9c0636ec2286 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -48,6 +48,7 @@ enum { CTRL_CMD_NEWMCAST_GRP, CTRL_CMD_DELMCAST_GRP, CTRL_CMD_GETMCAST_GRP, /* unused */ + CTRL_CMD_GETPOLICY, __CTRL_CMD_MAX, }; @@ -62,6 +63,7 @@ enum { CTRL_ATTR_MAXATTR, CTRL_ATTR_OPS, CTRL_ATTR_MCAST_GROUPS, + CTRL_ATTR_POLICY, __CTRL_ATTR_MAX, }; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 0a4d73317759..eac8a6a648ea 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -249,4 +249,107 @@ struct nla_bitfield32 { __u32 selector; }; +/* + * policy descriptions - it's specific to each family how this is used + * Normally, it should be retrieved via a dump inside another attribute + * specifying where it applies. + */ + +/** + * enum netlink_attribute_type - type of an attribute + * @NL_ATTR_TYPE_INVALID: unused + * @NL_ATTR_TYPE_FLAG: flag attribute (present/not present) + * @NL_ATTR_TYPE_U8: 8-bit unsigned attribute + * @NL_ATTR_TYPE_U16: 16-bit unsigned attribute + * @NL_ATTR_TYPE_U32: 32-bit unsigned attribute + * @NL_ATTR_TYPE_U64: 64-bit unsigned attribute + * @NL_ATTR_TYPE_S8: 8-bit signed attribute + * @NL_ATTR_TYPE_S16: 16-bit signed attribute + * @NL_ATTR_TYPE_S32: 32-bit signed attribute + * @NL_ATTR_TYPE_S64: 64-bit signed attribute + * @NL_ATTR_TYPE_BINARY: binary data, min/max length may be specified + * @NL_ATTR_TYPE_STRING: string, min/max length may be specified + * @NL_ATTR_TYPE_NUL_STRING: NUL-terminated string, + * min/max length may be specified + * @NL_ATTR_TYPE_NESTED: nested, i.e. the content of this attribute + * consists of sub-attributes. The nested policy and maxtype + * inside may be specified. + * @NL_ATTR_TYPE_NESTED_ARRAY: nested array, i.e. the content of this + * attribute contains sub-attributes whose type is irrelevant + * (just used to separate the array entries) and each such array + * entry has attributes again, the policy for those inner ones + * and the corresponding maxtype may be specified. + * @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute + */ +enum netlink_attribute_type { + NL_ATTR_TYPE_INVALID, + + NL_ATTR_TYPE_FLAG, + + NL_ATTR_TYPE_U8, + NL_ATTR_TYPE_U16, + NL_ATTR_TYPE_U32, + NL_ATTR_TYPE_U64, + + NL_ATTR_TYPE_S8, + NL_ATTR_TYPE_S16, + NL_ATTR_TYPE_S32, + NL_ATTR_TYPE_S64, + + NL_ATTR_TYPE_BINARY, + NL_ATTR_TYPE_STRING, + NL_ATTR_TYPE_NUL_STRING, + + NL_ATTR_TYPE_NESTED, + NL_ATTR_TYPE_NESTED_ARRAY, + + NL_ATTR_TYPE_BITFIELD32, +}; + +/** + * enum netlink_policy_type_attr - policy type attributes + * @NL_POLICY_TYPE_ATTR_UNSPEC: unused + * @NL_POLICY_TYPE_ATTR_TYPE: type of the attribute, + * &enum netlink_attribute_type (U32) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_S: minimum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_S: maximum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_U: minimum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_U: maximum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MIN_LENGTH: minimum length for binary + * attributes, no minimum if not given (U32) + * @NL_POLICY_TYPE_ATTR_MAX_LENGTH: maximum length for binary + * attributes, no maximum if not given (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_IDX: sub policy for nested and + * nested array types (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE: maximum sub policy + * attribute for nested and nested array types, this can + * in theory be < the size of the policy pointed to by + * the index, if limited inside the nesting (U32) + * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the + * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + */ +enum netlink_policy_type_attr { + NL_POLICY_TYPE_ATTR_UNSPEC, + NL_POLICY_TYPE_ATTR_TYPE, + NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + NL_POLICY_TYPE_ATTR_MIN_LENGTH, + NL_POLICY_TYPE_ATTR_MAX_LENGTH, + NL_POLICY_TYPE_ATTR_POLICY_IDX, + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + NL_POLICY_TYPE_ATTR_PAD, + + /* keep last */ + __NL_POLICY_TYPE_ATTR_MAX, + NL_POLICY_TYPE_ATTR_MAX = __NL_POLICY_TYPE_ATTR_MAX - 1 +}; + #endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/net/netlink/Makefile b/net/netlink/Makefile index de42df7f0068..e05202708c90 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -3,7 +3,7 @@ # Makefile for the netlink driver. # -obj-y := af_netlink.o genetlink.o +obj-y := af_netlink.o genetlink.o policy.o obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o netlink_diag-y := diag.o diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 9f357aa22b94..2f049692e012 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1043,6 +1043,80 @@ static int genl_ctrl_event(int event, const struct genl_family *family, return 0; } +static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct genl_family *rt; + unsigned int fam_id = cb->args[0]; + int err; + + if (!fam_id) { + struct nlattr *tb[CTRL_ATTR_MAX + 1]; + + err = genlmsg_parse(cb->nlh, &genl_ctrl, tb, + genl_ctrl.maxattr, + genl_ctrl.policy, cb->extack); + if (err) + return err; + + if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) + return -EINVAL; + + if (tb[CTRL_ATTR_FAMILY_ID]) { + fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); + } else { + rt = genl_family_find_byname( + nla_data(tb[CTRL_ATTR_FAMILY_NAME])); + if (!rt) + return -ENOENT; + fam_id = rt->id; + } + } + + rt = genl_family_find_byid(fam_id); + if (!rt) + return -ENOENT; + + if (!rt->policy) + return -ENODATA; + + err = netlink_policy_dump_start(rt->policy, rt->maxattr, &cb->args[1]); + if (err) + return err; + + while (netlink_policy_dump_loop(&cb->args[1])) { + void *hdr; + struct nlattr *nest; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &genl_ctrl, + NLM_F_MULTI, CTRL_CMD_GETPOLICY); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, rt->id)) + goto nla_put_failure; + + nest = nla_nest_start(skb, CTRL_ATTR_POLICY); + if (!nest) + goto nla_put_failure; + + if (netlink_policy_dump_write(skb, cb->args[1])) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + genlmsg_end(skb, hdr); + continue; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + break; + } + + cb->args[0] = fam_id; + return skb->len; +} + static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, @@ -1050,6 +1124,10 @@ static const struct genl_ops genl_ctrl_ops[] = { .doit = ctrl_getfamily, .dumpit = ctrl_dumpfamily, }, + { + .cmd = CTRL_CMD_GETPOLICY, + .dumpit = ctrl_dumppolicy, + }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { diff --git a/net/netlink/policy.c b/net/netlink/policy.c new file mode 100644 index 000000000000..f6491853c797 --- /dev/null +++ b/net/netlink/policy.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NETLINK Policy advertisement to userspace + * + * Authors: Johannes Berg + * + * Copyright 2019 Intel Corporation + */ + +#include +#include +#include +#include + +#define INITIAL_POLICIES_ALLOC 10 + +struct nl_policy_dump { + unsigned int policy_idx; + unsigned int attr_idx; + unsigned int n_alloc; + struct { + const struct nla_policy *policy; + unsigned int maxtype; + } policies[]; +}; + +static int add_policy(struct nl_policy_dump **statep, + const struct nla_policy *policy, + unsigned int maxtype) +{ + struct nl_policy_dump *state = *statep; + unsigned int n_alloc, i; + + if (!policy || !maxtype) + return 0; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return 0; + + if (!state->policies[i].policy) { + state->policies[i].policy = policy; + state->policies[i].maxtype = maxtype; + return 0; + } + } + + n_alloc = state->n_alloc + INITIAL_POLICIES_ALLOC; + state = krealloc(state, struct_size(state, policies, n_alloc), + GFP_KERNEL); + if (!state) + return -ENOMEM; + + state->policies[state->n_alloc].policy = policy; + state->policies[state->n_alloc].maxtype = maxtype; + state->n_alloc = n_alloc; + *statep = state; + + return 0; +} + +static unsigned int get_policy_idx(struct nl_policy_dump *state, + const struct nla_policy *policy) +{ + unsigned int i; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return i; + } + + WARN_ON_ONCE(1); + return -1; +} + +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *_state) +{ + struct nl_policy_dump *state; + unsigned int policy_idx; + int err; + + /* also returns 0 if "*_state" is our ERR_PTR() end marker */ + if (*_state) + return 0; + + /* + * walk the policies and nested ones first, and build + * a linear list of them. + */ + + state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC), + GFP_KERNEL); + if (!state) + return -ENOMEM; + state->n_alloc = INITIAL_POLICIES_ALLOC; + + err = add_policy(&state, policy, maxtype); + if (err) + return err; + + for (policy_idx = 0; + policy_idx < state->n_alloc && state->policies[policy_idx].policy; + policy_idx++) { + const struct nla_policy *policy; + unsigned int type; + + policy = state->policies[policy_idx].policy; + + for (type = 0; + type <= state->policies[policy_idx].maxtype; + type++) { + switch (policy[type].type) { + case NLA_NESTED: + case NLA_NESTED_ARRAY: + err = add_policy(&state, + policy[type].nested_policy, + policy[type].len); + if (err) + return err; + break; + default: + break; + } + } + } + + *_state = (unsigned long)state; + + return 0; +} + +static bool netlink_policy_dump_finished(struct nl_policy_dump *state) +{ + return state->policy_idx >= state->n_alloc || + !state->policies[state->policy_idx].policy; +} + +bool netlink_policy_dump_loop(unsigned long *_state) +{ + struct nl_policy_dump *state = (void *)*_state; + + if (IS_ERR(state)) + return false; + + if (netlink_policy_dump_finished(state)) { + kfree(state); + /* store end marker instead of freed state */ + *_state = (unsigned long)ERR_PTR(-ENOENT); + return false; + } + + return true; +} + +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long _state) +{ + struct nl_policy_dump *state = (void *)_state; + const struct nla_policy *pt; + struct nlattr *policy, *attr; + enum netlink_attribute_type type; + bool again; + +send_attribute: + again = false; + + pt = &state->policies[state->policy_idx].policy[state->attr_idx]; + + policy = nla_nest_start(skb, state->policy_idx); + if (!policy) + return -ENOBUFS; + + attr = nla_nest_start(skb, state->attr_idx); + if (!attr) + goto nla_put_failure; + + switch (pt->type) { + default: + case NLA_UNSPEC: + case NLA_REJECT: + /* skip - use NLA_MIN_LEN to advertise such */ + nla_nest_cancel(skb, policy); + again = true; + goto next; + case NLA_NESTED: + type = NL_ATTR_TYPE_NESTED; + /* fall through */ + case NLA_NESTED_ARRAY: + if (pt->type == NLA_NESTED_ARRAY) + type = NL_ATTR_TYPE_NESTED_ARRAY; + if (pt->nested_policy && pt->len && + (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_IDX, + get_policy_idx(state, pt->nested_policy)) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + pt->len))) + goto nla_put_failure; + break; + case NLA_U8: + case NLA_U16: + case NLA_U32: + case NLA_U64: + case NLA_MSECS: { + struct netlink_range_validation range; + + if (pt->type == NLA_U8) + type = NL_ATTR_TYPE_U8; + else if (pt->type == NLA_U16) + type = NL_ATTR_TYPE_U16; + else if (pt->type == NLA_U32) + type = NL_ATTR_TYPE_U32; + else + type = NL_ATTR_TYPE_U64; + + nla_get_range_unsigned(pt, &range); + + if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_S8: + case NLA_S16: + case NLA_S32: + case NLA_S64: { + struct netlink_range_validation_signed range; + + if (pt->type == NLA_S8) + type = NL_ATTR_TYPE_S8; + else if (pt->type == NLA_S16) + type = NL_ATTR_TYPE_S16; + else if (pt->type == NLA_S32) + type = NL_ATTR_TYPE_S32; + else + type = NL_ATTR_TYPE_S64; + + nla_get_range_signed(pt, &range); + + if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_BITFIELD32: + type = NL_ATTR_TYPE_BITFIELD32; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + pt->bitfield32_valid)) + goto nla_put_failure; + break; + case NLA_EXACT_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_STRING: + case NLA_NUL_STRING: + case NLA_BINARY: + if (pt->type == NLA_STRING) + type = NL_ATTR_TYPE_STRING; + else if (pt->type == NLA_NUL_STRING) + type = NL_ATTR_TYPE_NUL_STRING; + else + type = NL_ATTR_TYPE_BINARY; + if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, + pt->len)) + goto nla_put_failure; + break; + case NLA_MIN_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_FLAG: + type = NL_ATTR_TYPE_FLAG; + break; + } + + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_TYPE, type)) + goto nla_put_failure; + + /* finish and move state to next attribute */ + nla_nest_end(skb, attr); + nla_nest_end(skb, policy); + +next: + state->attr_idx += 1; + if (state->attr_idx > state->policies[state->policy_idx].maxtype) { + state->attr_idx = 0; + state->policy_idx++; + } + + if (again) { + if (netlink_policy_dump_finished(state)) + return -ENODATA; + goto send_attribute; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, policy); + return -ENOBUFS; +} -- cgit v1.2.3 From 0aeaaf64e6d06e353de15dcf9973312ae0672ca1 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 29 Apr 2020 19:36:06 -0400 Subject: drm/amdkfd: Fix comment formatting Corrected two function names. Added a missing space. Signed-off-by: Felix Kuehling Reviewed-by: Kent Russell Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- include/uapi/linux/kfd_ioctl.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 598296034b43..d27221ddcdeb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1122,7 +1122,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) return p; } -/* process_evict_queues - Evict all user queues of a process +/* kfd_process_evict_queues - Evict all user queues of a process * * Eviction is reference-counted per process-device. This means multiple * evictions from different sources can be nested safely. @@ -1162,7 +1162,7 @@ fail: return r; } -/* process_restore_queues - Restore all user queues of a process */ +/* kfd_process_restore_queues - Restore all user queues of a process */ int kfd_process_restore_queues(struct kfd_process *p) { struct kfd_process_device *pdd; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 4f6676428c5c..b6be62356d34 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -251,7 +251,7 @@ struct kfd_memory_exception_failure { __u32 imprecise; /* Can't determine the exact fault address */ }; -/* memory exception data*/ +/* memory exception data */ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; __u64 va; -- cgit v1.2.3 From d46edd671a147032e22cfeb271a5734703093649 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Apr 2020 00:15:04 -0700 Subject: bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats. Typical userspace tools use kernel.bpf_stats_enabled as follows: 1. Enable kernel.bpf_stats_enabled; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Disable kernel.bpf_stats_enabled. The problem with this approach is that only one userspace tool can toggle this sysctl. If multiple tools toggle the sysctl at the same time, the measurement may be inaccurate. To fix this problem while keep backward compatibility, introduce a new bpf command BPF_ENABLE_STATS. On success, this command enables stats and returns a valid fd. BPF_ENABLE_STATS takes argument "type". Currently, only one type, BPF_STATS_RUN_TIME, is supported. We can extend the command to support other types of stats in the future. With BPF_ENABLE_STATS, user space tool would have the following flow: 1. Get a fd with BPF_ENABLE_STATS, and make sure it is valid; 2. Check program run_time_ns; 3. Sleep for the monitoring period; 4. Check program run_time_ns again, calculate the difference; 5. Close the fd. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200430071506.1408910-2-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 ++++++++ kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 36 +++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 11 ++++++++ 5 files changed, 115 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c07b1d2f3824..1262ec460ab3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -987,6 +987,7 @@ _out: \ #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +extern struct mutex bpf_stats_enabled_mutex; /* * Block execution of BPF programs attached to instrumentation (perf, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eccafae55bb..705e4822f997 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -115,6 +115,7 @@ enum bpf_cmd { BPF_LINK_UPDATE, BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -390,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -601,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c75b2dd2459c..4f34eecec9ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3872,6 +3872,60 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) return fd; } +DEFINE_MUTEX(bpf_stats_enabled_mutex); + +static int bpf_stats_release(struct inode *inode, struct file *file) +{ + mutex_lock(&bpf_stats_enabled_mutex); + static_key_slow_dec(&bpf_stats_enabled_key.key); + mutex_unlock(&bpf_stats_enabled_mutex); + return 0; +} + +static const struct file_operations bpf_stats_fops = { + .release = bpf_stats_release, +}; + +static int bpf_enable_runtime_stats(void) +{ + int fd; + + mutex_lock(&bpf_stats_enabled_mutex); + + /* Set a very high limit to avoid overflow */ + if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { + mutex_unlock(&bpf_stats_enabled_mutex); + return -EBUSY; + } + + fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); + if (fd >= 0) + static_key_slow_inc(&bpf_stats_enabled_key.key); + + mutex_unlock(&bpf_stats_enabled_mutex); + return fd; +} + +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type + +static int bpf_enable_stats(union bpf_attr *attr) +{ + + if (CHECK_ATTR(BPF_ENABLE_STATS)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (attr->enable_stats.type) { + case BPF_STATS_RUN_TIME: + return bpf_enable_runtime_stats(); + default: + break; + } + return -EINVAL; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3996,6 +4050,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &link_idr, &link_idr_lock); break; + case BPF_ENABLE_STATS: + err = bpf_enable_stats(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e961286d0e14..7adfe5dbce9d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,6 +201,40 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL +static int bpf_stats_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} +#endif + /* * /proc/sys support */ @@ -2549,7 +2583,7 @@ static struct ctl_table kern_table[] = { .data = &bpf_stats_enabled_key.key, .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_do_static_key, + .proc_handler = bpf_stats_handler, }, #endif #if defined(CONFIG_TREE_RCU) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0eccafae55bb..705e4822f997 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -115,6 +115,7 @@ enum bpf_cmd { BPF_LINK_UPDATE, BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -390,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -601,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 883780af72090daf9ab53779a3085a6ddfc468ca Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 May 2020 16:44:27 +0200 Subject: docs: networking: convert x25-iface.txt to ReST Not much to be done here: - add SPDX header; - adjust title markup; - remove a tail whitespace; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/x25-iface.rst | 129 +++++++++++++++++++++++++++++++++ Documentation/networking/x25-iface.txt | 123 ------------------------------- include/uapi/linux/if_x25.h | 2 +- net/x25/Kconfig | 2 +- 5 files changed, 132 insertions(+), 125 deletions(-) create mode 100644 Documentation/networking/x25-iface.rst delete mode 100644 Documentation/networking/x25-iface.txt (limited to 'include/uapi') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index a72fdfb391b6..7a4bdbc111b0 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -115,6 +115,7 @@ Contents: udplite vrf vxlan + x25-iface .. only:: subproject and html diff --git a/Documentation/networking/x25-iface.rst b/Documentation/networking/x25-iface.rst new file mode 100644 index 000000000000..df401891dce6 --- /dev/null +++ b/Documentation/networking/x25-iface.rst @@ -0,0 +1,129 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================- +X.25 Device Driver Interface +============================- + +Version 1.1 + + Jonathan Naylor 26.12.96 + +This is a description of the messages to be passed between the X.25 Packet +Layer and the X.25 device driver. They are designed to allow for the easy +setting of the LAPB mode from within the Packet Layer. + +The X.25 device driver will be coded normally as per the Linux device driver +standards. Most X.25 device drivers will be moderately similar to the +already existing Ethernet device drivers. However unlike those drivers, the +X.25 device driver has a state associated with it, and this information +needs to be passed to and from the Packet Layer for proper operation. + +All messages are held in sk_buff's just like real data to be transmitted +over the LAPB link. The first byte of the skbuff indicates the meaning of +the rest of the skbuff, if any more information does exist. + + +Packet Layer to Device Driver +----------------------------- + +First Byte = 0x00 (X25_IFACE_DATA) + +This indicates that the rest of the skbuff contains data to be transmitted +over the LAPB link. The LAPB link should already exist before any data is +passed down. + +First Byte = 0x01 (X25_IFACE_CONNECT) + +Establish the LAPB link. If the link is already established then the connect +confirmation message should be returned as soon as possible. + +First Byte = 0x02 (X25_IFACE_DISCONNECT) + +Terminate the LAPB link. If it is already disconnected then the disconnect +confirmation message should be returned as soon as possible. + +First Byte = 0x03 (X25_IFACE_PARAMS) + +LAPB parameters. To be defined. + + +Device Driver to Packet Layer +----------------------------- + +First Byte = 0x00 (X25_IFACE_DATA) + +This indicates that the rest of the skbuff contains data that has been +received over the LAPB link. + +First Byte = 0x01 (X25_IFACE_CONNECT) + +LAPB link has been established. The same message is used for both a LAPB +link connect_confirmation and a connect_indication. + +First Byte = 0x02 (X25_IFACE_DISCONNECT) + +LAPB link has been terminated. This same message is used for both a LAPB +link disconnect_confirmation and a disconnect_indication. + +First Byte = 0x03 (X25_IFACE_PARAMS) + +LAPB parameters. To be defined. + + + +Possible Problems +================= + +(Henner Eisen, 2000-10-28) + +The X.25 packet layer protocol depends on a reliable datalink service. +The LAPB protocol provides such reliable service. But this reliability +is not preserved by the Linux network device driver interface: + +- With Linux 2.4.x (and above) SMP kernels, packet ordering is not + preserved. Even if a device driver calls netif_rx(skb1) and later + netif_rx(skb2), skb2 might be delivered to the network layer + earlier that skb1. +- Data passed upstream by means of netif_rx() might be dropped by the + kernel if the backlog queue is congested. + +The X.25 packet layer protocol will detect this and reset the virtual +call in question. But many upper layer protocols are not designed to +handle such N-Reset events gracefully. And frequent N-Reset events +will always degrade performance. + +Thus, driver authors should make netif_rx() as reliable as possible: + +SMP re-ordering will not occur if the driver's interrupt handler is +always executed on the same CPU. Thus, + +- Driver authors should use irq affinity for the interrupt handler. + +The probability of packet loss due to backlog congestion can be +reduced by the following measures or a combination thereof: + +(1) Drivers for kernel versions 2.4.x and above should always check the + return value of netif_rx(). If it returns NET_RX_DROP, the + driver's LAPB protocol must not confirm reception of the frame + to the peer. + This will reliably suppress packet loss. The LAPB protocol will + automatically cause the peer to re-transmit the dropped packet + later. + The lapb module interface was modified to support this. Its + data_indication() method should now transparently pass the + netif_rx() return value to the (lapb module) caller. +(2) Drivers for kernel versions 2.2.x should always check the global + variable netdev_dropping when a new frame is received. The driver + should only call netif_rx() if netdev_dropping is zero. Otherwise + the driver should not confirm delivery of the frame and drop it. + Alternatively, the driver can queue the frame internally and call + netif_rx() later when netif_dropping is 0 again. In that case, delivery + confirmation should also be deferred such that the internal queue + cannot grow to much. + This will not reliably avoid packet loss, but the probability + of packet loss in netif_rx() path will be significantly reduced. +(3) Additionally, driver authors might consider to support + CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up + when a previously congested backlog queue becomes empty again. + The driver could uses this for flow-controlling the peer by means + of the LAPB protocol's flow-control service. diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt deleted file mode 100644 index 7f213b556e85..000000000000 --- a/Documentation/networking/x25-iface.txt +++ /dev/null @@ -1,123 +0,0 @@ - X.25 Device Driver Interface 1.1 - - Jonathan Naylor 26.12.96 - -This is a description of the messages to be passed between the X.25 Packet -Layer and the X.25 device driver. They are designed to allow for the easy -setting of the LAPB mode from within the Packet Layer. - -The X.25 device driver will be coded normally as per the Linux device driver -standards. Most X.25 device drivers will be moderately similar to the -already existing Ethernet device drivers. However unlike those drivers, the -X.25 device driver has a state associated with it, and this information -needs to be passed to and from the Packet Layer for proper operation. - -All messages are held in sk_buff's just like real data to be transmitted -over the LAPB link. The first byte of the skbuff indicates the meaning of -the rest of the skbuff, if any more information does exist. - - -Packet Layer to Device Driver ------------------------------ - -First Byte = 0x00 (X25_IFACE_DATA) - -This indicates that the rest of the skbuff contains data to be transmitted -over the LAPB link. The LAPB link should already exist before any data is -passed down. - -First Byte = 0x01 (X25_IFACE_CONNECT) - -Establish the LAPB link. If the link is already established then the connect -confirmation message should be returned as soon as possible. - -First Byte = 0x02 (X25_IFACE_DISCONNECT) - -Terminate the LAPB link. If it is already disconnected then the disconnect -confirmation message should be returned as soon as possible. - -First Byte = 0x03 (X25_IFACE_PARAMS) - -LAPB parameters. To be defined. - - -Device Driver to Packet Layer ------------------------------ - -First Byte = 0x00 (X25_IFACE_DATA) - -This indicates that the rest of the skbuff contains data that has been -received over the LAPB link. - -First Byte = 0x01 (X25_IFACE_CONNECT) - -LAPB link has been established. The same message is used for both a LAPB -link connect_confirmation and a connect_indication. - -First Byte = 0x02 (X25_IFACE_DISCONNECT) - -LAPB link has been terminated. This same message is used for both a LAPB -link disconnect_confirmation and a disconnect_indication. - -First Byte = 0x03 (X25_IFACE_PARAMS) - -LAPB parameters. To be defined. - - - -Possible Problems -================= - -(Henner Eisen, 2000-10-28) - -The X.25 packet layer protocol depends on a reliable datalink service. -The LAPB protocol provides such reliable service. But this reliability -is not preserved by the Linux network device driver interface: - -- With Linux 2.4.x (and above) SMP kernels, packet ordering is not - preserved. Even if a device driver calls netif_rx(skb1) and later - netif_rx(skb2), skb2 might be delivered to the network layer - earlier that skb1. -- Data passed upstream by means of netif_rx() might be dropped by the - kernel if the backlog queue is congested. - -The X.25 packet layer protocol will detect this and reset the virtual -call in question. But many upper layer protocols are not designed to -handle such N-Reset events gracefully. And frequent N-Reset events -will always degrade performance. - -Thus, driver authors should make netif_rx() as reliable as possible: - -SMP re-ordering will not occur if the driver's interrupt handler is -always executed on the same CPU. Thus, - -- Driver authors should use irq affinity for the interrupt handler. - -The probability of packet loss due to backlog congestion can be -reduced by the following measures or a combination thereof: - -(1) Drivers for kernel versions 2.4.x and above should always check the - return value of netif_rx(). If it returns NET_RX_DROP, the - driver's LAPB protocol must not confirm reception of the frame - to the peer. - This will reliably suppress packet loss. The LAPB protocol will - automatically cause the peer to re-transmit the dropped packet - later. - The lapb module interface was modified to support this. Its - data_indication() method should now transparently pass the - netif_rx() return value to the (lapb module) caller. -(2) Drivers for kernel versions 2.2.x should always check the global - variable netdev_dropping when a new frame is received. The driver - should only call netif_rx() if netdev_dropping is zero. Otherwise - the driver should not confirm delivery of the frame and drop it. - Alternatively, the driver can queue the frame internally and call - netif_rx() later when netif_dropping is 0 again. In that case, delivery - confirmation should also be deferred such that the internal queue - cannot grow to much. - This will not reliably avoid packet loss, but the probability - of packet loss in netif_rx() path will be significantly reduced. -(3) Additionally, driver authors might consider to support - CONFIG_NET_HW_FLOWCONTROL. This allows the driver to be woken up - when a previously congested backlog queue becomes empty again. - The driver could uses this for flow-controlling the peer by means - of the LAPB protocol's flow-control service. diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h index 5d962448345f..3a5938e38370 100644 --- a/include/uapi/linux/if_x25.h +++ b/include/uapi/linux/if_x25.h @@ -18,7 +18,7 @@ #include -/* Documentation/networking/x25-iface.txt */ +/* Documentation/networking/x25-iface.rst */ #define X25_IFACE_DATA 0x00 #define X25_IFACE_CONNECT 0x01 #define X25_IFACE_DISCONNECT 0x02 diff --git a/net/x25/Kconfig b/net/x25/Kconfig index 2ecb2e5e241e..a328f79885d1 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -21,7 +21,7 @@ config X25 . Information about X.25 for Linux is contained in the files and - . + . One connects to an X.25 network either with a dedicated network card using the X.21 protocol (not yet supported by Linux) or one can do -- cgit v1.2.3 From beecf11bc2188067824591612151c4dc6ec383c7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 30 Apr 2020 16:31:52 -0700 Subject: bpf: Bpf_{g,s}etsockopt for struct bpf_sock_addr Currently, bpf_getsockopt and bpf_setsockopt helpers operate on the 'struct bpf_sock_ops' context in BPF_PROG_TYPE_SOCK_OPS program. Let's generalize them and make them available for 'struct bpf_sock_addr'. That way, in the future, we can allow those helpers in more places. As an example, let's expose those 'struct bpf_sock_addr' based helpers to BPF_CGROUP_INET{4,6}_CONNECT hooks. That way we can override CC before the connection is made. v3: * Expose custom helpers for bpf_sock_addr context instead of doing generic bpf_sock argument (as suggested by Daniel). Even with try_socket_lock that doesn't sleep we have a problem where context sk is already locked and socket lock is non-nestable. v2: * s/BPF_PROG_TYPE_CGROUP_SOCKOPT/BPF_PROG_TYPE_SOCK_OPS/ Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200430233152.199403-1-sdf@google.com --- include/uapi/linux/bpf.h | 14 ++- net/core/filter.c | 118 +++++++++++++++++----- tools/include/uapi/linux/bpf.h | 14 ++- tools/testing/selftests/bpf/config | 1 + tools/testing/selftests/bpf/progs/connect4_prog.c | 46 +++++++++ 5 files changed, 166 insertions(+), 27 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * diff --git a/net/core/filter.c b/net/core/filter.c index 70b32723e6be..dfaf5df13722 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4194,16 +4194,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +#define SOCKOPT_CC_REINIT (1 << 0) + +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen, u32 flags) { - struct sock *sk = bpf_sock->sk; int ret = 0; int val; if (!sk_fullsock(sk)) return -EINVAL; + sock_owned_by_me(sk); + if (level == SOL_SOCKET) { if (optlen != sizeof(int)) return -EINVAL; @@ -4298,7 +4301,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; + bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); @@ -4345,24 +4348,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, return ret; } -static const struct bpf_func_proto bpf_setsockopt_proto = { - .func = bpf_setsockopt, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, -}; - -BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - struct sock *sk = bpf_sock->sk; - if (!sk_fullsock(sk)) goto err_clear; + + sock_owned_by_me(sk); + #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; @@ -4428,8 +4421,71 @@ err_clear: return -EINVAL; } -static const struct bpf_func_proto bpf_getsockopt_proto = { - .func = bpf_getsockopt, +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + flags |= SOCKOPT_CC_REINIT; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -6043,6 +6099,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -6261,9 +6333,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: - return &bpf_setsockopt_proto; + return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: - return &bpf_getsockopt_proto; + return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 60e3ae5d4e48..6e5b94c036ca 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -37,3 +37,4 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y +CONFIG_TCP_CONG_DCTCP=y diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index ad3c498a8150..972918cd2d7f 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,10 @@ #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 +#ifndef TCP_CA_NAME_MAX +#define TCP_CA_NAME_MAX 16 +#endif + int _version SEC("version") = 1; __attribute__ ((noinline)) @@ -33,6 +38,43 @@ int do_bind(struct bpf_sock_addr *ctx) return 1; } +static __inline int verify_cc(struct bpf_sock_addr *ctx, + char expected[TCP_CA_NAME_MAX]) +{ + char buf[TCP_CA_NAME_MAX]; + int i; + + if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 1; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (buf[i] != expected[i]) + return 1; + if (buf[i] == 0) + break; + } + + return 0; +} + +static __inline int set_cc(struct bpf_sock_addr *ctx) +{ + char dctcp[TCP_CA_NAME_MAX] = "dctcp"; + char cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &dctcp, sizeof(dctcp))) + return 1; + if (verify_cc(ctx, dctcp)) + return 1; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + return 1; + if (verify_cc(ctx, cubic)) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -66,6 +108,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) bpf_sk_release(sk); + /* Rewrite congestion control. */ + if (ctx->type == SOCK_STREAM && set_cc(ctx)) + return 0; + /* Rewrite destination. */ ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4); ctx->user_port = bpf_htons(DST_REWRITE_PORT4); -- cgit v1.2.3 From a51c328df3106663879645680609eb49b3ff6444 Mon Sep 17 00:00:00 2001 From: Po Liu Date: Fri, 1 May 2020 08:53:15 +0800 Subject: net: qos: introduce a gate control flow action Introduce a ingress frame gate control flow action. Tc gate action does the work like this: Assume there is a gate allow specified ingress frames can be passed at specific time slot, and be dropped at specific time slot. Tc filter chooses the ingress frames, and tc gate action would specify what slot does these frames can be passed to device and what time slot would be dropped. Tc gate action would provide an entry list to tell how much time gate keep open and how much time gate keep state close. Gate action also assign a start time to tell when the entry list start. Then driver would repeat the gate entry list cyclically. For the software simulation, gate action requires the user assign a time clock type. Below is the setting example in user space. Tc filter a stream source ip address is 192.168.0.20 and gate action own two time slots. One is last 200ms gate open let frame pass another is last 100ms gate close let frames dropped. When the ingress frames have reach total frames over 8000000 bytes, the excessive frames will be dropped in that 200000000ns time slot. > tc qdisc add dev eth0 ingress > tc filter add dev eth0 parent ffff: protocol ip \ flower src_ip 192.168.0.20 \ action gate index 2 clockid CLOCK_TAI \ sched-entry open 200000000 -1 8000000 \ sched-entry close 100000000 -1 -1 > tc chain del dev eth0 ingress chain 0 "sched-entry" follow the name taprio style. Gate state is "open"/"close". Follow with period nanosecond. Then next item is internal priority value means which ingress queue should put. "-1" means wildcard. The last value optional specifies the maximum number of MSDU octets that are permitted to pass the gate during the specified time interval. Base-time is not set will be 0 as default, as result start time would be ((N + 1) * cycletime) which is the minimal of future time. Below example shows filtering a stream with destination mac address is 10:00:80:00:00:00 and ip type is ICMP, follow the action gate. The gate action would run with one close time slot which means always keep close. The time cycle is total 200000000ns. The base-time would calculate by: 1357000000000 + (N + 1) * cycletime When the total value is the future time, it will be the start time. The cycletime here would be 200000000ns for this case. > tc filter add dev eth0 parent ffff: protocol ip \ flower skip_hw ip_proto icmp dst_mac 10:00:80:00:00:00 \ action gate index 12 base-time 1357000000000 \ sched-entry close 200000000 -1 -1 \ clockid CLOCK_TAI Signed-off-by: Po Liu Signed-off-by: David S. Miller --- include/net/tc_act/tc_gate.h | 47 +++ include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_gate.h | 47 +++ net/sched/Kconfig | 12 + net/sched/Makefile | 1 + net/sched/act_gate.c | 636 ++++++++++++++++++++++++++++++++++++ 6 files changed, 744 insertions(+) create mode 100644 include/net/tc_act/tc_gate.h create mode 100644 include/uapi/linux/tc_act/tc_gate.h create mode 100644 net/sched/act_gate.c (limited to 'include/uapi') diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h new file mode 100644 index 000000000000..330ad8b02495 --- /dev/null +++ b/include/net/tc_act/tc_gate.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright 2020 NXP */ + +#ifndef __NET_TC_GATE_H +#define __NET_TC_GATE_H + +#include +#include + +struct tcfg_gate_entry { + int index; + u8 gate_state; + u32 interval; + s32 ipv; + s32 maxoctets; + struct list_head list; +}; + +struct tcf_gate_params { + s32 tcfg_priority; + u64 tcfg_basetime; + u64 tcfg_cycletime; + u64 tcfg_cycletime_ext; + u32 tcfg_flags; + s32 tcfg_clockid; + size_t num_entries; + struct list_head entries; +}; + +#define GATE_ACT_GATE_OPEN BIT(0) +#define GATE_ACT_PENDING BIT(1) + +struct tcf_gate { + struct tc_action common; + struct tcf_gate_params param; + u8 current_gate_status; + ktime_t current_close_time; + u32 current_entry_octets; + s32 current_max_octets; + struct tcfg_gate_entry *next_entry; + struct hrtimer hitimer; + enum tk_offsets tk_offset; +}; + +#define to_gate(a) ((struct tcf_gate *)a) + +#endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9f06d29cab70..fc672b232437 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -134,6 +134,7 @@ enum tca_id { TCA_ID_CTINFO, TCA_ID_MPLS, TCA_ID_CT, + TCA_ID_GATE, /* other actions go here */ __TCA_ID_MAX = 255 }; diff --git a/include/uapi/linux/tc_act/tc_gate.h b/include/uapi/linux/tc_act/tc_gate.h new file mode 100644 index 000000000000..f214b3a6d44f --- /dev/null +++ b/include/uapi/linux/tc_act/tc_gate.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* Copyright 2020 NXP */ + +#ifndef __LINUX_TC_GATE_H +#define __LINUX_TC_GATE_H + +#include + +struct tc_gate { + tc_gen; +}; + +enum { + TCA_GATE_ENTRY_UNSPEC, + TCA_GATE_ENTRY_INDEX, + TCA_GATE_ENTRY_GATE, + TCA_GATE_ENTRY_INTERVAL, + TCA_GATE_ENTRY_IPV, + TCA_GATE_ENTRY_MAX_OCTETS, + __TCA_GATE_ENTRY_MAX, +}; +#define TCA_GATE_ENTRY_MAX (__TCA_GATE_ENTRY_MAX - 1) + +enum { + TCA_GATE_ONE_ENTRY_UNSPEC, + TCA_GATE_ONE_ENTRY, + __TCA_GATE_ONE_ENTRY_MAX, +}; +#define TCA_GATE_ONE_ENTRY_MAX (__TCA_GATE_ONE_ENTRY_MAX - 1) + +enum { + TCA_GATE_UNSPEC, + TCA_GATE_TM, + TCA_GATE_PARMS, + TCA_GATE_PAD, + TCA_GATE_PRIORITY, + TCA_GATE_ENTRY_LIST, + TCA_GATE_BASE_TIME, + TCA_GATE_CYCLE_TIME, + TCA_GATE_CYCLE_TIME_EXT, + TCA_GATE_FLAGS, + TCA_GATE_CLOCKID, + __TCA_GATE_MAX, +}; +#define TCA_GATE_MAX (__TCA_GATE_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index bfbefb7bff9d..2f20073f4f84 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -981,6 +981,18 @@ config NET_ACT_CT To compile this code as a module, choose M here: the module will be called act_ct. +config NET_ACT_GATE + tristate "Frame gate entry list control tc action" + depends on NET_CLS_ACT + help + Say Y here to allow to control the ingress flow to be passed at + specific time slot and be dropped at other specific time slot by + the gate entry list. + + If unsure, say N. + To compile this code as a module, choose M here: the + module will be called act_gate. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index 31c367a6cd09..66bbf9a98f9e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_ACT_CT) += act_ct.o +obj-$(CONFIG_NET_ACT_GATE) += act_gate.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c new file mode 100644 index 000000000000..35fc48795541 --- /dev/null +++ b/net/sched/act_gate.c @@ -0,0 +1,636 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright 2020 NXP */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int gate_net_id; +static struct tc_action_ops act_gate_ops; + +static ktime_t gate_get_time(struct tcf_gate *gact) +{ + ktime_t mono = ktime_get(); + + switch (gact->tk_offset) { + case TK_OFFS_MAX: + return mono; + default: + return ktime_mono_to_any(mono, gact->tk_offset); + } + + return KTIME_MAX; +} + +static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +{ + struct tcf_gate_params *param = &gact->param; + ktime_t now, base, cycle; + u64 n; + + base = ns_to_ktime(param->tcfg_basetime); + now = gate_get_time(gact); + + if (ktime_after(base, now)) { + *start = base; + return 0; + } + + cycle = param->tcfg_cycletime; + + /* cycle time should not be zero */ + if (!cycle) + return -EFAULT; + + n = div64_u64(ktime_sub_ns(now, base), cycle); + *start = ktime_add_ns(base, (n + 1) * cycle); + return 0; +} + +static void gate_start_timer(struct tcf_gate *gact, ktime_t start) +{ + ktime_t expires; + + expires = hrtimer_get_expires(&gact->hitimer); + if (expires == 0) + expires = KTIME_MAX; + + start = min_t(ktime_t, start, expires); + + hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); +} + +static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) +{ + struct tcf_gate *gact = container_of(timer, struct tcf_gate, + hitimer); + struct tcf_gate_params *p = &gact->param; + struct tcfg_gate_entry *next; + ktime_t close_time, now; + + spin_lock(&gact->tcf_lock); + + next = gact->next_entry; + + /* cycle start, clear pending bit, clear total octets */ + gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; + gact->current_entry_octets = 0; + gact->current_max_octets = next->maxoctets; + + gact->current_close_time = ktime_add_ns(gact->current_close_time, + next->interval); + + close_time = gact->current_close_time; + + if (list_is_last(&next->list, &p->entries)) + next = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + else + next = list_next_entry(next, list); + + now = gate_get_time(gact); + + if (ktime_after(now, close_time)) { + ktime_t cycle, base; + u64 n; + + cycle = p->tcfg_cycletime; + base = ns_to_ktime(p->tcfg_basetime); + n = div64_u64(ktime_sub_ns(now, base), cycle); + close_time = ktime_add_ns(base, (n + 1) * cycle); + } + + gact->next_entry = next; + + hrtimer_set_expires(&gact->hitimer, close_time); + + spin_unlock(&gact->tcf_lock); + + return HRTIMER_RESTART; +} + +static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_gate *gact = to_gate(a); + + spin_lock(&gact->tcf_lock); + + tcf_lastuse_update(&gact->tcf_tm); + bstats_update(&gact->tcf_bstats, skb); + + if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { + spin_unlock(&gact->tcf_lock); + return gact->tcf_action; + } + + if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) + goto drop; + + if (gact->current_max_octets >= 0) { + gact->current_entry_octets += qdisc_pkt_len(skb); + if (gact->current_entry_octets > gact->current_max_octets) { + gact->tcf_qstats.overlimits++; + goto drop; + } + } + + spin_unlock(&gact->tcf_lock); + + return gact->tcf_action; +drop: + gact->tcf_qstats.drops++; + spin_unlock(&gact->tcf_lock); + + return TC_ACT_SHOT; +} + +static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { + [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, + [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, +}; + +static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { + [TCA_GATE_PARMS] = { .len = sizeof(struct tc_gate), + .type = NLA_EXACT_LEN }, + [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, + [TCA_GATE_FLAGS] = { .type = NLA_U32 }, + [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, +}; + +static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, + struct netlink_ext_ack *extack) +{ + u32 interval = 0; + + entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); + + if (tb[TCA_GATE_ENTRY_INTERVAL]) + interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); + + if (interval == 0) { + NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); + return -EINVAL; + } + + entry->interval = interval; + + if (tb[TCA_GATE_ENTRY_IPV]) + entry->ipv = nla_get_s32(tb[TCA_GATE_ENTRY_IPV]); + else + entry->ipv = -1; + + if (tb[TCA_GATE_ENTRY_MAX_OCTETS]) + entry->maxoctets = nla_get_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]); + else + entry->maxoctets = -1; + + return 0; +} + +static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, + int index, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; + int err; + + err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + entry->index = index; + + return fill_gate_entry(tb, entry, extack); +} + +static void release_entry_list(struct list_head *entries) +{ + struct tcfg_gate_entry *entry, *e; + + list_for_each_entry_safe(entry, e, entries, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int parse_gate_list(struct nlattr *list_attr, + struct tcf_gate_params *sched, + struct netlink_ext_ack *extack) +{ + struct tcfg_gate_entry *entry; + struct nlattr *n; + int err, rem; + int i = 0; + + if (!list_attr) + return -EINVAL; + + nla_for_each_nested(n, list_attr, rem) { + if (nla_type(n) != TCA_GATE_ONE_ENTRY) { + NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); + continue; + } + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + err = -ENOMEM; + goto release_list; + } + + err = parse_gate_entry(n, entry, i, extack); + if (err < 0) { + kfree(entry); + goto release_list; + } + + list_add_tail(&entry->list, &sched->entries); + i++; + } + + sched->num_entries = i; + + return i; + +release_list: + release_entry_list(&sched->entries); + + return err; +} + +static int tcf_gate_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + enum tk_offsets tk_offset = TK_OFFS_TAI; + struct nlattr *tb[TCA_GATE_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct tcf_gate_params *p; + s32 clockid = CLOCK_TAI; + struct tcf_gate *gact; + struct tc_gate *parm; + int ret = 0, err; + u64 basetime = 0; + u32 gflags = 0; + s32 prio = -1; + ktime_t start; + u32 index; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_GATE_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_GATE_PARMS]); + index = parm->index; + + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + + if (err && bind) + return 0; + + if (!err) { + ret = tcf_idr_create(tn, index, est, a, + &act_gate_ops, bind, false, 0); + if (ret) { + tcf_idr_cleanup(tn, index); + return ret; + } + + ret = ACT_P_CREATED; + } else if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + + if (tb[TCA_GATE_PRIORITY]) + prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); + + if (tb[TCA_GATE_BASE_TIME]) + basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); + + if (tb[TCA_GATE_FLAGS]) + gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); + + if (tb[TCA_GATE_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); + switch (clockid) { + case CLOCK_REALTIME: + tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + tk_offset = TK_OFFS_MAX; + break; + case CLOCK_BOOTTIME: + tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + goto release_idr; + } + } + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + gact = to_gate(*a); + + spin_lock_bh(&gact->tcf_lock); + p = &gact->param; + + if (tb[TCA_GATE_CYCLE_TIME]) { + p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); + if (!p->tcfg_cycletime_ext) + goto chain_put; + } + + INIT_LIST_HEAD(&p->entries); + if (tb[TCA_GATE_ENTRY_LIST]) { + err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); + if (err < 0) + goto chain_put; + } + + if (!p->tcfg_cycletime) { + struct tcfg_gate_entry *entry; + ktime_t cycle = 0; + + list_for_each_entry(entry, &p->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + p->tcfg_cycletime = cycle; + } + + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + p->tcfg_cycletime_ext = + nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); + + p->tcfg_priority = prio; + p->tcfg_basetime = basetime; + p->tcfg_clockid = clockid; + p->tcfg_flags = gflags; + + gact->tk_offset = tk_offset; + hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); + gact->hitimer.function = gate_timer_func; + + err = gate_get_start_time(gact, &start); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Internal error: failed get start time"); + release_entry_list(&p->entries); + goto chain_put; + } + + gact->current_close_time = start; + gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; + + gact->next_entry = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + + gate_start_timer(gact, start); + + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +chain_put: + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_gate_cleanup(struct tc_action *a) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_gate_params *p; + + hrtimer_cancel(&gact->hitimer); + + p = &gact->param; + + release_entry_list(&p->entries); +} + +static int dumping_entry(struct sk_buff *skb, + struct tcfg_gate_entry *entry) +{ + struct nlattr *item; + + item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); + if (!item) + return -ENOSPC; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) + goto nla_put_failure; + + if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) + goto nla_put_failure; + + return nla_nest_end(skb, item); + +nla_put_failure: + nla_nest_cancel(skb, item); + return -1; +} + +static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_gate *gact = to_gate(a); + struct tc_gate opt = { + .index = gact->tcf_index, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, + }; + struct tcfg_gate_entry *entry; + struct tcf_gate_params *p; + struct nlattr *entry_list; + struct tcf_t t; + + spin_lock_bh(&gact->tcf_lock); + opt.action = gact->tcf_action; + + p = &gact->param; + + if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, + p->tcfg_basetime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, + p->tcfg_cycletime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, + p->tcfg_cycletime_ext, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) + goto nla_put_failure; + + entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); + if (!entry_list) + goto nla_put_failure; + + list_for_each_entry(entry, &p->entries, list) { + if (dumping_entry(skb, entry) < 0) + goto nla_put_failure; + } + + nla_nest_end(skb, entry_list); + + tcf_tm_dump(&t, &gact->tcf_tm); + if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) + goto nla_put_failure; + spin_unlock_bh(&gact->tcf_lock); + + return skb->len; + +nla_put_failure: + spin_unlock_bh(&gact->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_gate_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_t *tm = &gact->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + +static int tcf_gate_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_idr_search(tn, a, index); +} + +static size_t tcf_gate_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_gate)); +} + +static struct tc_action_ops act_gate_ops = { + .kind = "gate", + .id = TCA_ID_GATE, + .owner = THIS_MODULE, + .act = tcf_gate_act, + .dump = tcf_gate_dump, + .init = tcf_gate_init, + .cleanup = tcf_gate_cleanup, + .walk = tcf_gate_walker, + .stats_update = tcf_gate_stats_update, + .get_fill_size = tcf_gate_get_fill_size, + .lookup = tcf_gate_search, + .size = sizeof(struct tcf_gate), +}; + +static __net_init int gate_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tc_action_net_init(net, tn, &act_gate_ops); +} + +static void __net_exit gate_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, gate_net_id); +} + +static struct pernet_operations gate_net_ops = { + .init = gate_init_net, + .exit_batch = gate_exit_net, + .id = &gate_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init gate_init_module(void) +{ + return tcf_register_action(&act_gate_ops, &gate_net_ops); +} + +static void __exit gate_cleanup_module(void) +{ + tcf_unregister_action(&act_gate_ops, &gate_net_ops); +} + +module_init(gate_init_module); +module_exit(gate_cleanup_module); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From d3f1cbd29fa63f1bb608603a6cd54ca7af56a68b Mon Sep 17 00:00:00 2001 From: Vincent Cheng Date: Fri, 1 May 2020 23:35:37 -0400 Subject: ptp: Add adjust_phase to ptp_clock_caps capability. Add adjust_phase to ptp_clock_caps capability to allow user to query if a PHC driver supports adjust phase with ioctl PTP_CLOCK_GETCAPS command. Signed-off-by: Vincent Cheng Reviewed-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 1 + include/uapi/linux/ptp_clock.h | 4 +++- tools/testing/selftests/ptp/testptp.c | 6 ++++-- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 93d574faf1fe..375cd6e4aade 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -136,6 +136,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) caps.pps = ptp->info->pps; caps.n_pins = ptp->info->n_pins; caps.cross_timestamping = ptp->info->getcrosststamp != NULL; + caps.adjust_phase = ptp->info->adjphase != NULL; if (copy_to_user((void __user *)arg, &caps, sizeof(caps))) err = -EFAULT; break; diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 9dc9d0079e98..ff070aa64278 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -89,7 +89,9 @@ struct ptp_clock_caps { int n_pins; /* Number of input/output pins. */ /* Whether the clock supports precise system-device cross timestamps */ int cross_timestamping; - int rsv[13]; /* Reserved for future use. */ + /* Whether the clock supports adjust phase */ + int adjust_phase; + int rsv[12]; /* Reserved for future use. */ }; struct ptp_extts_request { diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c index c0dd10257df5..da7a9dda9490 100644 --- a/tools/testing/selftests/ptp/testptp.c +++ b/tools/testing/selftests/ptp/testptp.c @@ -269,14 +269,16 @@ int main(int argc, char *argv[]) " %d programmable periodic signals\n" " %d pulse per second\n" " %d programmable pins\n" - " %d cross timestamping\n", + " %d cross timestamping\n" + " %d adjust_phase\n", caps.max_adj, caps.n_alarm, caps.n_ext_ts, caps.n_per_out, caps.pps, caps.n_pins, - caps.cross_timestamping); + caps.cross_timestamping, + caps.adjust_phase); } } -- cgit v1.2.3 From 712b2698e4c024b561694cbcc1abba13eb0fd9ce Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Apr 2020 07:41:34 -0700 Subject: fs/stat: Define DAX statx attribute In order for users to determine if a file is currently operating in DAX state (effective DAX). Define a statx attribute value and set that attribute if the effective DAX flag is set. To go along with this we propose the following addition to the statx man page: STATX_ATTR_DAX The file is in the DAX (cpu direct access) state. DAX state attempts to minimize software cache effects for both I/O and memory mappings of this file. It requires a file system which has been configured to support DAX. DAX generally assumes all accesses are via cpu load / store instructions which can minimize overhead for small accesses, but may adversely affect cpu utilization for large transfers. File I/O is done directly to/from user-space buffers and memory mapped I/O may be performed with direct memory mappings that bypass kernel page cache. While the DAX property tends to result in data being transferred synchronously, it does not give the same guarantees of O_SYNC where data and the necessary metadata are transferred together. A DAX file may support being mapped with the MAP_SYNC flag, which enables a program to use CPU cache flush instructions to persist CPU store operations without an explicit fsync(2). See mmap(2) for more information. Reviewed-by: Dave Chinner Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Ira Weiny Signed-off-by: Darrick J. Wong --- fs/stat.c | 3 +++ include/uapi/linux/stat.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/fs/stat.c b/fs/stat.c index 030008796479..894699c74dde 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -79,6 +79,9 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; + if (IS_DAX(inode)) + stat->attributes |= STATX_ATTR_DAX; + if (inode->i_op->getattr) return inode->i_op->getattr(path, stat, request_mask, query_flags); diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index ad80a5c885d5..e5f9d5517f6b 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -169,6 +169,7 @@ struct statx { #define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ #define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ +#define STATX_ATTR_DAX 0x00002000 /* [I] File is DAX */ #endif /* _UAPI_LINUX_STAT_H */ -- cgit v1.2.3 From 39d010504e6b4485d7ceee167743620dd33f4417 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 May 2020 07:07:41 -0700 Subject: net_sched: sch_fq: add horizon attribute QUIC servers would like to use SO_TXTIME, without having CAP_NET_ADMIN, to efficiently pace UDP packets. As far as sch_fq is concerned, we need to add safety checks, so that a buggy application does not fill the qdisc with packets having delivery time far in the future. This patch adds a configurable horizon (default: 10 seconds), and a configurable policy when a packet is beyond the horizon at enqueue() time: - either drop the packet (default policy) - or cap its delivery time to the horizon. $ tc -s -d qd sh dev eth0 qdisc fq 8022: root refcnt 257 limit 10000p flow_limit 100p buckets 1024 orphan_mask 1023 quantum 10Kb initial_quantum 51160b low_rate_threshold 550Kbit refill_delay 40.0ms timer_slack 10.000us horizon 10.000s Sent 1234215879 bytes 837099 pkt (dropped 21, overlimits 0 requeues 6) backlog 0b 0p requeues 6 flows 1191 (inactive 1177 throttled 0) gc 0 highprio 0 throttled 692 latency 11.480us pkts_too_long 0 alloc_errors 0 horizon_drops 21 horizon_caps 0 v2: fixed an overflow on 32bit kernels in fq_init(), reported by kbuild test robot Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 6 +++++ net/sched/sch_fq.c | 59 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 0c02737c8f47..a95f3ae7ab37 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -913,6 +913,10 @@ enum { TCA_FQ_TIMER_SLACK, /* timer slack */ + TCA_FQ_HORIZON, /* time horizon in us */ + + TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */ + __TCA_FQ_MAX }; @@ -932,6 +936,8 @@ struct tc_fq_qd_stats { __u32 throttled_flows; __u32 unthrottle_latency_ns; __u64 ce_mark; /* packets above ce_threshold */ + __u64 horizon_drops; + __u64 horizon_caps; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4f0104243cc2..8f06a808c59a 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -100,6 +100,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + u64 ktime_cache; /* copy of last ktime_get_ns() */ unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ @@ -109,12 +110,13 @@ struct fq_sched_data { u32 flow_plimit; /* max packets per flow */ unsigned long flow_max_rate; /* optional max rate per flow */ u64 ce_threshold; + u64 horizon; /* horizon in ns */ u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; - + u8 horizon_drop; u32 flows; u32 inactive_flows; u32 throttled_flows; @@ -123,6 +125,8 @@ struct fq_sched_data { u64 stat_internal_packets; u64 stat_throttled; u64 stat_ce_mark; + u64 stat_horizon_drops; + u64 stat_horizon_caps; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; @@ -402,8 +406,6 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) struct rb_node **p, *parent; struct sk_buff *head, *aux; - fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns(); - head = flow->head; if (!head || fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { @@ -431,6 +433,12 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) rb_insert_color(&skb->rbnode, &flow->t_root); } +static bool fq_packet_beyond_horizon(const struct sk_buff *skb, + const struct fq_sched_data *q) +{ + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); +} + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -440,6 +448,28 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); + if (!skb->tstamp) { + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); + } else { + /* Check if packet timestamp is too far in the future. + * Try first if our cached value, to avoid ktime_get_ns() + * cost in most cases. + */ + if (fq_packet_beyond_horizon(skb, q)) { + /* Refresh our cache and check another time */ + q->ktime_cache = ktime_get_ns(); + if (fq_packet_beyond_horizon(skb, q)) { + if (q->horizon_drop) { + q->stat_horizon_drops++; + return qdisc_drop(skb, sch, to_free); + } + q->stat_horizon_caps++; + skb->tstamp = q->ktime_cache + q->horizon; + } + } + fq_skb_cb(skb)->time_to_send = skb->tstamp; + } + f = fq_classify(skb, q); if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { q->stat_flows_plimit++; @@ -512,7 +542,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) goto out; } - now = ktime_get_ns(); + q->ktime_cache = now = ktime_get_ns(); fq_check_throttled(q, now); begin: head = &q->new_flows; @@ -765,6 +795,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, + [TCA_FQ_HORIZON] = { .type = NLA_U32 }, + [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -854,7 +886,15 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_TIMER_SLACK]) q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + if (tb[TCA_FQ_HORIZON]) + q->horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_HORIZON]); + + if (tb[TCA_FQ_HORIZON_DROP]) + q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); + if (!err) { + sch_tree_unlock(sch); err = fq_resize(sch, fq_log); sch_tree_lock(sch); @@ -907,6 +947,9 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ + q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ + q->horizon_drop = 1; /* by default, drop packets beyond horizon */ + /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; @@ -924,6 +967,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); u64 ce_threshold = q->ce_threshold; + u64 horizon = q->horizon; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -933,6 +977,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ do_div(ce_threshold, NSEC_PER_USEC); + do_div(horizon, NSEC_PER_USEC); if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || @@ -948,7 +993,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || - nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack)) + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || + nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -979,6 +1026,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.unthrottle_latency_ns = min_t(unsigned long, q->unthrottle_latency_ns, ~0U); st.ce_mark = q->stat_ce_mark; + st.horizon_drops = q->stat_horizon_drops; + st.horizon_caps = q->stat_horizon_caps; sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); -- cgit v1.2.3 From f645e6256bd1b12523b759fcc610861fb21c24c7 Mon Sep 17 00:00:00 2001 From: Niklas Söderlund Date: Tue, 21 Apr 2020 15:57:38 +0200 Subject: media: v4l2-dev/ioctl: Add V4L2_CAP_IO_MC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a video device capability flag to indicate that its inputs and/or outputs are controlled by the Media Controller instead of the V4L2 API. When this flag is set, ioctl for enum inputs and outputs are automatically enabled and programmed to call a helper function. Suggested-by: Hans Verkuil Signed-off-by: Helen Koike Signed-off-by: Niklas Söderlund Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/vidioc-querycap.rst | 6 +++ .../userspace-api/media/videodev2.h.rst.exceptions | 1 + drivers/media/v4l2-core/v4l2-dev.c | 25 +++++++--- drivers/media/v4l2-core/v4l2-ioctl.c | 57 ++++++++++++++++++++-- include/uapi/linux/videodev2.h | 2 + 5 files changed, 81 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/vidioc-querycap.rst b/Documentation/userspace-api/media/v4l/vidioc-querycap.rst index 28e1f766128c..666ac4d42051 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-querycap.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-querycap.rst @@ -264,6 +264,12 @@ specification the ioctl returns an ``EINVAL`` error code. * - ``V4L2_CAP_TOUCH`` - 0x10000000 - This is a touch device. + * - ``V4L2_CAP_IO_MC`` + - 0x20000000 + - There is only one input and/or output seen from userspace. The whole + video topology configuration, including which I/O entity is routed to + the input/output, is configured by userspace via the Media Controller. + See :ref:`media_controller`. * - ``V4L2_CAP_DEVICE_CAPS`` - 0x80000000 - The driver fills the ``device_caps`` field. This capability can diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions index cb6ccf91776e..a625fb90e3a9 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions @@ -176,6 +176,7 @@ replace define V4L2_CAP_STREAMING device-capabilities replace define V4L2_CAP_META_OUTPUT device-capabilities replace define V4L2_CAP_DEVICE_CAPS device-capabilities replace define V4L2_CAP_TOUCH device-capabilities +replace define V4L2_CAP_IO_MC device-capabilities # V4L2 pix flags replace define V4L2_PIX_FMT_PRIV_MAGIC :c:type:`v4l2_pix_format` diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c index 97b6a3af1361..a593ea0598b5 100644 --- a/drivers/media/v4l2-core/v4l2-dev.c +++ b/drivers/media/v4l2-core/v4l2-dev.c @@ -552,6 +552,7 @@ static void determine_valid_ioctls(struct video_device *vdev) (vdev->device_caps & meta_caps); bool is_rx = vdev->vfl_dir != VFL_DIR_TX; bool is_tx = vdev->vfl_dir != VFL_DIR_RX; + bool is_io_mc = vdev->device_caps & V4L2_CAP_IO_MC; bitmap_zero(valid_ioctls, BASE_VIDIOC_PRIVATE); @@ -725,9 +726,15 @@ static void determine_valid_ioctls(struct video_device *vdev) SET_VALID_IOCTL(ops, VIDIOC_G_STD, vidioc_g_std); if (is_rx) { SET_VALID_IOCTL(ops, VIDIOC_QUERYSTD, vidioc_querystd); - SET_VALID_IOCTL(ops, VIDIOC_ENUMINPUT, vidioc_enum_input); - SET_VALID_IOCTL(ops, VIDIOC_G_INPUT, vidioc_g_input); - SET_VALID_IOCTL(ops, VIDIOC_S_INPUT, vidioc_s_input); + if (is_io_mc) { + set_bit(_IOC_NR(VIDIOC_ENUMINPUT), valid_ioctls); + set_bit(_IOC_NR(VIDIOC_G_INPUT), valid_ioctls); + set_bit(_IOC_NR(VIDIOC_S_INPUT), valid_ioctls); + } else { + SET_VALID_IOCTL(ops, VIDIOC_ENUMINPUT, vidioc_enum_input); + SET_VALID_IOCTL(ops, VIDIOC_G_INPUT, vidioc_g_input); + SET_VALID_IOCTL(ops, VIDIOC_S_INPUT, vidioc_s_input); + } SET_VALID_IOCTL(ops, VIDIOC_ENUMAUDIO, vidioc_enumaudio); SET_VALID_IOCTL(ops, VIDIOC_G_AUDIO, vidioc_g_audio); SET_VALID_IOCTL(ops, VIDIOC_S_AUDIO, vidioc_s_audio); @@ -735,9 +742,15 @@ static void determine_valid_ioctls(struct video_device *vdev) SET_VALID_IOCTL(ops, VIDIOC_S_EDID, vidioc_s_edid); } if (is_tx) { - SET_VALID_IOCTL(ops, VIDIOC_ENUMOUTPUT, vidioc_enum_output); - SET_VALID_IOCTL(ops, VIDIOC_G_OUTPUT, vidioc_g_output); - SET_VALID_IOCTL(ops, VIDIOC_S_OUTPUT, vidioc_s_output); + if (is_io_mc) { + set_bit(_IOC_NR(VIDIOC_ENUMOUTPUT), valid_ioctls); + set_bit(_IOC_NR(VIDIOC_G_OUTPUT), valid_ioctls); + set_bit(_IOC_NR(VIDIOC_S_OUTPUT), valid_ioctls); + } else { + SET_VALID_IOCTL(ops, VIDIOC_ENUMOUTPUT, vidioc_enum_output); + SET_VALID_IOCTL(ops, VIDIOC_G_OUTPUT, vidioc_g_output); + SET_VALID_IOCTL(ops, VIDIOC_S_OUTPUT, vidioc_s_output); + } SET_VALID_IOCTL(ops, VIDIOC_ENUMAUDOUT, vidioc_enumaudout); SET_VALID_IOCTL(ops, VIDIOC_G_AUDOUT, vidioc_g_audout); SET_VALID_IOCTL(ops, VIDIOC_S_AUDOUT, vidioc_s_audout); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 833e79a2cb98..665b2bc7b732 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1085,6 +1085,32 @@ static int v4l_querycap(const struct v4l2_ioctl_ops *ops, return ret; } +static int v4l_g_input(const struct v4l2_ioctl_ops *ops, + struct file *file, void *fh, void *arg) +{ + struct video_device *vfd = video_devdata(file); + + if (vfd->device_caps & V4L2_CAP_IO_MC) { + *(int *)arg = 0; + return 0; + } + + return ops->vidioc_g_input(file, fh, arg); +} + +static int v4l_g_output(const struct v4l2_ioctl_ops *ops, + struct file *file, void *fh, void *arg) +{ + struct video_device *vfd = video_devdata(file); + + if (vfd->device_caps & V4L2_CAP_IO_MC) { + *(int *)arg = 0; + return 0; + } + + return ops->vidioc_g_output(file, fh, arg); +} + static int v4l_s_input(const struct v4l2_ioctl_ops *ops, struct file *file, void *fh, void *arg) { @@ -1094,12 +1120,21 @@ static int v4l_s_input(const struct v4l2_ioctl_ops *ops, ret = v4l_enable_media_source(vfd); if (ret) return ret; + + if (vfd->device_caps & V4L2_CAP_IO_MC) + return *(int *)arg ? -EINVAL : 0; + return ops->vidioc_s_input(file, fh, *(unsigned int *)arg); } static int v4l_s_output(const struct v4l2_ioctl_ops *ops, struct file *file, void *fh, void *arg) { + struct video_device *vfd = video_devdata(file); + + if (vfd->device_caps & V4L2_CAP_IO_MC) + return *(int *)arg ? -EINVAL : 0; + return ops->vidioc_s_output(file, fh, *(unsigned int *)arg); } @@ -1143,6 +1178,14 @@ static int v4l_enuminput(const struct v4l2_ioctl_ops *ops, if (is_valid_ioctl(vfd, VIDIOC_S_STD)) p->capabilities |= V4L2_IN_CAP_STD; + if (vfd->device_caps & V4L2_CAP_IO_MC) { + if (p->index) + return -EINVAL; + strscpy(p->name, vfd->name, sizeof(p->name)); + p->type = V4L2_INPUT_TYPE_CAMERA; + return 0; + } + return ops->vidioc_enum_input(file, fh, p); } @@ -1161,6 +1204,14 @@ static int v4l_enumoutput(const struct v4l2_ioctl_ops *ops, if (is_valid_ioctl(vfd, VIDIOC_S_STD)) p->capabilities |= V4L2_OUT_CAP_STD; + if (vfd->device_caps & V4L2_CAP_IO_MC) { + if (p->index) + return -EINVAL; + strscpy(p->name, vfd->name, sizeof(p->name)); + p->type = V4L2_OUTPUT_TYPE_ANALOG; + return 0; + } + return ops->vidioc_enum_output(file, fh, p); } @@ -2683,10 +2734,8 @@ DEFINE_V4L_STUB_FUNC(expbuf) DEFINE_V4L_STUB_FUNC(g_std) DEFINE_V4L_STUB_FUNC(g_audio) DEFINE_V4L_STUB_FUNC(s_audio) -DEFINE_V4L_STUB_FUNC(g_input) DEFINE_V4L_STUB_FUNC(g_edid) DEFINE_V4L_STUB_FUNC(s_edid) -DEFINE_V4L_STUB_FUNC(g_output) DEFINE_V4L_STUB_FUNC(g_audout) DEFINE_V4L_STUB_FUNC(s_audout) DEFINE_V4L_STUB_FUNC(g_jpegcomp) @@ -2735,11 +2784,11 @@ static const struct v4l2_ioctl_info v4l2_ioctls[] = { IOCTL_INFO(VIDIOC_S_AUDIO, v4l_stub_s_audio, v4l_print_audio, INFO_FL_PRIO), IOCTL_INFO(VIDIOC_QUERYCTRL, v4l_queryctrl, v4l_print_queryctrl, INFO_FL_CTRL | INFO_FL_CLEAR(v4l2_queryctrl, id)), IOCTL_INFO(VIDIOC_QUERYMENU, v4l_querymenu, v4l_print_querymenu, INFO_FL_CTRL | INFO_FL_CLEAR(v4l2_querymenu, index)), - IOCTL_INFO(VIDIOC_G_INPUT, v4l_stub_g_input, v4l_print_u32, 0), + IOCTL_INFO(VIDIOC_G_INPUT, v4l_g_input, v4l_print_u32, 0), IOCTL_INFO(VIDIOC_S_INPUT, v4l_s_input, v4l_print_u32, INFO_FL_PRIO), IOCTL_INFO(VIDIOC_G_EDID, v4l_stub_g_edid, v4l_print_edid, INFO_FL_ALWAYS_COPY), IOCTL_INFO(VIDIOC_S_EDID, v4l_stub_s_edid, v4l_print_edid, INFO_FL_PRIO | INFO_FL_ALWAYS_COPY), - IOCTL_INFO(VIDIOC_G_OUTPUT, v4l_stub_g_output, v4l_print_u32, 0), + IOCTL_INFO(VIDIOC_G_OUTPUT, v4l_g_output, v4l_print_u32, 0), IOCTL_INFO(VIDIOC_S_OUTPUT, v4l_s_output, v4l_print_u32, INFO_FL_PRIO), IOCTL_INFO(VIDIOC_ENUMOUTPUT, v4l_enumoutput, v4l_print_enumoutput, INFO_FL_CLEAR(v4l2_output, index)), IOCTL_INFO(VIDIOC_G_AUDOUT, v4l_stub_g_audout, v4l_print_audioout, 0), diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 9817b7e2c968..b18f3f7cde31 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -487,6 +487,8 @@ struct v4l2_capability { #define V4L2_CAP_TOUCH 0x10000000 /* Is a touch device */ +#define V4L2_CAP_IO_MC 0x20000000 /* Is input/output controlled by the media controller */ + #define V4L2_CAP_DEVICE_CAPS 0x80000000 /* sets device capabilities field */ /* -- cgit v1.2.3 From e5b6b07a1b45dd9d19bec1fa1d60750b0fcf2fb0 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 24 Apr 2020 15:43:31 +0200 Subject: media: v4l2: Extend VIDIOC_ENUM_FMT to support MC-centric devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VIDIOC_ENUM_FMT ioctl enumerates all formats supported by a video node. For MC-centric devices, its behaviour has always been ill-defined, with drivers implementing one of the following behaviours: - No support for VIDIOC_ENUM_FMT at all - Enumerating all formats supported by the video node, regardless of the configuration of the pipeline - Enumerating formats supported by the video node for the active configuration of the connected subdevice The first behaviour is obviously useless for applications. The second behaviour provides the most information, but doesn't offer a way to find what formats are compatible with a given pipeline configuration. The third behaviour fixes that, but with the drawback that applications can't enumerate all supported formats anymore, and have to modify the active configuration of the pipeline to enumerate formats. The situation is messy as none of the implemented behaviours are ideal, and userspace can't predict what will happen as the behaviour is driver-specific. To fix this, let's extend the VIDIOC_ENUM_FMT with a missing capability: enumerating pixel formats for a given media bus code. The media bus code is passed through the v4l2_fmtdesc structure in a new mbus_code field (repurposed from the reserved fields). With this capability in place, applications can enumerate pixel formats for a given media bus code without modifying the active configuration of the device. The current behaviour of the ioctl is preserved when the new mbus_code field is set to 0, ensuring compatibility with existing userspace. The API extension is documented as mandatory for MC-centric devices (as advertised through the V4L2_CAP_IO_MC capability), allowing applications and compliance tools to easily determine the availability of the VIDIOC_ENUM_FMT extension. Signed-off-by: Laurent Pinchart Acked-by: Sakari Ailus Signed-off-by: Niklas Söderlund Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/vidioc-enum-fmt.rst | 27 ++++++++++++++++++---- drivers/media/v4l2-core/v4l2-ioctl.c | 13 +++++++++-- include/uapi/linux/videodev2.h | 3 ++- 3 files changed, 35 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst index 7e3142e11d77..9694111772a2 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst @@ -48,10 +48,21 @@ one until ``EINVAL`` is returned. If applicable, drivers shall return formats in preference order, where preferred formats are returned before (that is, with lower ``index`` value) less-preferred formats. -.. note:: - - After switching input or output the list of enumerated image - formats may be different. +If the driver doesn't advertise the ``V4L2_CAP_IO_MC`` :ref:`capability +`, applications shall initialize the ``mbus_code`` field +to zero and drivers shall ignore the value of the field. Drivers shall +enumerate all image formats. The enumerated formats may depend on the active +input or output of the device. + +If the driver advertises the ``V4L2_CAP_IO_MC`` :ref:`capability +`, applications may initialize the ``mbus_code`` field to +a valid :ref:`media bus format code `. If the +``mbus_code`` field is not zero, drivers shall restrict enumeration to only the +image formats that can produce (for video output devices) or be produced from +(for video capture devices) that media bus code. Regardless of the value of +the ``mbus_code`` field, the enumerated image formats shall not depend on the +active configuration of the video device or device pipeline. Enumeration shall +otherwise operate as previously described. .. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}| @@ -106,7 +117,13 @@ formats in preference order, where preferred formats are returned before These codes are not the same as those used in the Windows world. * - __u32 - - ``reserved``\ [4] + - ``mbus_code`` + - Media bus code restricting the enumerated formats, set by the + application. Only applicable to drivers that advertise the + ``V4L2_CAP_IO_MC`` :ref:`capability `, shall be 0 + otherwise. + * - __u32 + - ``reserved``\ [3] - Reserved for future extensions. Drivers must set the array to zero. diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 665b2bc7b732..2322f08a98be 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -264,12 +264,13 @@ static void v4l_print_fmtdesc(const void *arg, bool write_only) { const struct v4l2_fmtdesc *p = arg; - pr_cont("index=%u, type=%s, flags=0x%x, pixelformat=%c%c%c%c, description='%.*s'\n", + pr_cont("index=%u, type=%s, flags=0x%x, pixelformat=%c%c%c%c, mbus_code=0x%04x, description='%.*s'\n", p->index, prt_names(p->type, v4l2_type_names), p->flags, (p->pixelformat & 0xff), (p->pixelformat >> 8) & 0xff, (p->pixelformat >> 16) & 0xff, (p->pixelformat >> 24) & 0xff, + p->mbus_code, (int)sizeof(p->description), p->description); } @@ -1472,12 +1473,20 @@ static int v4l_enum_fmt(const struct v4l2_ioctl_ops *ops, struct video_device *vdev = video_devdata(file); struct v4l2_fmtdesc *p = arg; int ret = check_fmt(file, p->type); + u32 mbus_code; u32 cap_mask; if (ret) return ret; ret = -EINVAL; + if (!(vdev->device_caps & V4L2_CAP_IO_MC)) + p->mbus_code = 0; + + mbus_code = p->mbus_code; + CLEAR_AFTER_FIELD(p, type); + p->mbus_code = mbus_code; + switch (p->type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE: case V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE: @@ -2757,7 +2766,7 @@ DEFINE_V4L_STUB_FUNC(dv_timings_cap) static const struct v4l2_ioctl_info v4l2_ioctls[] = { IOCTL_INFO(VIDIOC_QUERYCAP, v4l_querycap, v4l_print_querycap, 0), - IOCTL_INFO(VIDIOC_ENUM_FMT, v4l_enum_fmt, v4l_print_fmtdesc, INFO_FL_CLEAR(v4l2_fmtdesc, type)), + IOCTL_INFO(VIDIOC_ENUM_FMT, v4l_enum_fmt, v4l_print_fmtdesc, 0), IOCTL_INFO(VIDIOC_G_FMT, v4l_g_fmt, v4l_print_format, 0), IOCTL_INFO(VIDIOC_S_FMT, v4l_s_fmt, v4l_print_format, INFO_FL_PRIO), IOCTL_INFO(VIDIOC_REQBUFS, v4l_reqbufs, v4l_print_requestbuffers, INFO_FL_PRIO | INFO_FL_QUEUE), diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index b18f3f7cde31..c3a1cf1c507f 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -784,7 +784,8 @@ struct v4l2_fmtdesc { __u32 flags; __u8 description[32]; /* Description string */ __u32 pixelformat; /* Format fourcc */ - __u32 reserved[4]; + __u32 mbus_code; /* Media bus code */ + __u32 reserved[3]; }; #define V4L2_FMT_FLAG_COMPRESSED 0x0001 -- cgit v1.2.3 From bdbdac7649fac05f88c9f7ab18121a17fb591687 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 5 May 2020 08:35:05 +0200 Subject: ethtool: provide UAPI for PHY master/slave configuration. This UAPI is needed for BroadR-Reach 100BASE-T1 devices. Due to lack of auto-negotiation support, we needed to be able to configure the MASTER-SLAVE role of the port manually or from an application in user space. The same UAPI can be used for 1000BASE-T or MultiGBASE-T devices to force MASTER or SLAVE role. See IEEE 802.3-2018: 22.2.4.3.7 MASTER-SLAVE control register (Register 9) 22.2.4.3.8 MASTER-SLAVE status register (Register 10) 40.5.2 MASTER-SLAVE configuration resolution 45.2.1.185.1 MASTER-SLAVE config value (1.2100.14) 45.2.7.10 MultiGBASE-T AN control 1 register (Register 7.32) The MASTER-SLAVE role affects the clock configuration: ------------------------------------------------------------------------------- When the PHY is configured as MASTER, the PMA Transmit function shall source TX_TCLK from a local clock source. When configured as SLAVE, the PMA Transmit function shall source TX_TCLK from the clock recovered from data stream provided by MASTER. iMX6Q KSZ9031 XXX ------\ /-----------\ /------------\ | | | | | MAC |<----RGMII----->| PHY Slave |<------>| PHY Master | |<--- 125 MHz ---+-<------/ | | \ | ------/ \-----------/ \------------/ ^ \-TX_TCLK ------------------------------------------------------------------------------- Since some clock or link related issues are only reproducible in a specific MASTER-SLAVE-role, MAC and PHY configuration, it is beneficial to provide generic (not 100BASE-T1 specific) interface to the user space for configuration flexibility and trouble shooting. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 35 ++++++----- drivers/net/phy/phy.c | 4 +- drivers/net/phy/phy_device.c | 94 ++++++++++++++++++++++++++++ include/linux/phy.h | 3 + include/uapi/linux/ethtool.h | 16 ++++- include/uapi/linux/ethtool_netlink.h | 2 + include/uapi/linux/mii.h | 2 + net/ethtool/ioctl.c | 6 ++ net/ethtool/linkmodes.c | 53 ++++++++++++++++ 9 files changed, 197 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 567326491f80..8f5cefc539cf 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -392,14 +392,16 @@ Request contents: Kernel response contents: - ==================================== ====== ========================== - ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header - ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status - ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes - ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes - ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) - ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode - ==================================== ====== ========================== + ========================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE`` u8 Master/slave port state + ========================================== ====== ========================== For ``ETHTOOL_A_LINKMODES_OURS``, value represents advertised modes and mask represents supported modes. ``ETHTOOL_A_LINKMODES_PEER`` in the reply is a bit @@ -414,14 +416,15 @@ LINKMODES_SET Request contents: - ==================================== ====== ========================== - ``ETHTOOL_A_LINKMODES_HEADER`` nested request header - ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status - ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes - ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes - ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) - ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode - ==================================== ====== ========================== + ========================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested request header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ========================================== ====== ========================== ``ETHTOOL_A_LINKMODES_OURS`` bit set allows setting advertised link modes. If autonegotiation is on (either set now or kept from before), advertised modes diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 72c69a9c8a98..8c22d02b4218 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -295,7 +295,7 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, phydev->advertising, autoneg == AUTONEG_ENABLE); phydev->duplex = duplex; - + phydev->master_slave_set = cmd->base.master_slave_cfg; phydev->mdix_ctrl = cmd->base.eth_tp_mdix_ctrl; /* Restart the PHY */ @@ -314,6 +314,8 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.speed = phydev->speed; cmd->base.duplex = phydev->duplex; + cmd->base.master_slave_cfg = phydev->master_slave_get; + cmd->base.master_slave_state = phydev->master_slave_state; if (phydev->interface == PHY_INTERFACE_MODE_MOCA) cmd->base.port = PORT_BNC; else diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b1c5e4503bc4..83fc8e1b5793 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1913,6 +1913,90 @@ int genphy_setup_forced(struct phy_device *phydev) } EXPORT_SYMBOL(genphy_setup_forced); +static int genphy_setup_master_slave(struct phy_device *phydev) +{ + u16 ctl = 0; + + if (!phydev->is_gigabit_capable) + return 0; + + switch (phydev->master_slave_set) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + ctl |= CTL1000_PREFER_MASTER; + break; + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + break; + case MASTER_SLAVE_CFG_MASTER_FORCE: + ctl |= CTL1000_AS_MASTER; + /* fallthrough */ + case MASTER_SLAVE_CFG_SLAVE_FORCE: + ctl |= CTL1000_ENABLE_MASTER; + break; + case MASTER_SLAVE_CFG_UNKNOWN: + case MASTER_SLAVE_CFG_UNSUPPORTED: + return 0; + default: + phydev_warn(phydev, "Unsupported Master/Slave mode\n"); + return -EOPNOTSUPP; + } + + return phy_modify_changed(phydev, MII_CTRL1000, + (CTL1000_ENABLE_MASTER | CTL1000_AS_MASTER | + CTL1000_PREFER_MASTER), ctl); +} + +static int genphy_read_master_slave(struct phy_device *phydev) +{ + int cfg, state; + u16 val; + + if (!phydev->is_gigabit_capable) { + phydev->master_slave_get = MASTER_SLAVE_CFG_UNSUPPORTED; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; + return 0; + } + + phydev->master_slave_get = MASTER_SLAVE_CFG_UNKNOWN; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNKNOWN; + + val = phy_read(phydev, MII_CTRL1000); + if (val < 0) + return val; + + if (val & CTL1000_ENABLE_MASTER) { + if (val & CTL1000_AS_MASTER) + cfg = MASTER_SLAVE_CFG_MASTER_FORCE; + else + cfg = MASTER_SLAVE_CFG_SLAVE_FORCE; + } else { + if (val & CTL1000_PREFER_MASTER) + cfg = MASTER_SLAVE_CFG_MASTER_PREFERRED; + else + cfg = MASTER_SLAVE_CFG_SLAVE_PREFERRED; + } + + val = phy_read(phydev, MII_STAT1000); + if (val < 0) + return val; + + if (val & LPA_1000MSFAIL) { + state = MASTER_SLAVE_STATE_ERR; + } else if (phydev->link) { + /* this bits are valid only for active link */ + if (val & LPA_1000MSRES) + state = MASTER_SLAVE_STATE_MASTER; + else + state = MASTER_SLAVE_STATE_SLAVE; + } else { + state = MASTER_SLAVE_STATE_UNKNOWN; + } + + phydev->master_slave_get = cfg; + phydev->master_slave_state = state; + + return 0; +} + /** * genphy_restart_aneg - Enable and Restart Autonegotiation * @phydev: target phy_device struct @@ -1971,6 +2055,12 @@ int __genphy_config_aneg(struct phy_device *phydev, bool changed) if (genphy_config_eee_advert(phydev)) changed = true; + err = genphy_setup_master_slave(phydev); + if (err < 0) + return err; + else if (err) + changed = true; + if (AUTONEG_ENABLE != phydev->autoneg) return genphy_setup_forced(phydev); @@ -2205,6 +2295,10 @@ int genphy_read_status(struct phy_device *phydev) phydev->pause = 0; phydev->asym_pause = 0; + err = genphy_read_master_slave(phydev); + if (err < 0) + return err; + err = genphy_read_lpa(phydev); if (err < 0) return err; diff --git a/include/linux/phy.h b/include/linux/phy.h index 1d36ac608159..a2b91b5f9d0a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -477,6 +477,9 @@ struct phy_device { int duplex; int pause; int asym_pause; + u8 master_slave_get; + u8 master_slave_set; + u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 92f737f10117..f4662b3a9e1e 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1666,6 +1666,18 @@ static inline int ethtool_validate_duplex(__u8 duplex) return 0; } +#define MASTER_SLAVE_CFG_UNSUPPORTED 0 +#define MASTER_SLAVE_CFG_UNKNOWN 1 +#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 +#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 +#define MASTER_SLAVE_CFG_MASTER_FORCE 4 +#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 +#define MASTER_SLAVE_STATE_UNSUPPORTED 0 +#define MASTER_SLAVE_STATE_UNKNOWN 1 +#define MASTER_SLAVE_STATE_MASTER 2 +#define MASTER_SLAVE_STATE_SLAVE 3 +#define MASTER_SLAVE_STATE_ERR 4 + /* Which connector port. */ #define PORT_TP 0x00 #define PORT_AUI 0x01 @@ -1904,7 +1916,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; __u8 transceiver; - __u8 reserved1[3]; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 reserved1[1]; __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7fde76366ba4..bf1d310e20bc 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -216,6 +216,8 @@ enum { ETHTOOL_A_LINKMODES_PEER, /* bitset */ ETHTOOL_A_LINKMODES_SPEED, /* u32 */ ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKMODES_CNT, diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index 90f9b4e1ba27..39f7c44baf53 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -151,11 +151,13 @@ /* 1000BASE-T Control register */ #define ADVERTISE_1000FULL 0x0200 /* Advertise 1000BASE-T full duplex */ #define ADVERTISE_1000HALF 0x0100 /* Advertise 1000BASE-T half duplex */ +#define CTL1000_PREFER_MASTER 0x0400 /* prefer to operate as master */ #define CTL1000_AS_MASTER 0x0800 #define CTL1000_ENABLE_MASTER 0x1000 /* 1000BASE-T Status register */ #define LPA_1000MSFAIL 0x8000 /* Master/Slave resolution failure */ +#define LPA_1000MSRES 0x4000 /* Master/Slave resolution status */ #define LPA_1000LOCALRXOK 0x2000 /* Link partner local receiver status */ #define LPA_1000REMRXOK 0x1000 /* Link partner remote receiver status */ #define LPA_1000FULL 0x0800 /* Link partner 1000BASE-T full duplex */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 226d5ecdd567..52102ab1709b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -552,6 +552,8 @@ static int ethtool_get_link_ksettings(struct net_device *dev, link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; + link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; + link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; return store_link_ksettings_for_user(useraddr, &link_ksettings); } @@ -589,6 +591,10 @@ static int ethtool_set_link_ksettings(struct net_device *dev, != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; + if (link_ksettings.base.master_slave_cfg || + link_ksettings.base.master_slave_state) + return -EINVAL; + err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 452608c6d856..fd4f3e58c6f6 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -27,6 +27,8 @@ linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, @@ -63,6 +65,7 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; + const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len, ret; @@ -86,6 +89,12 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, len += ret; } + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + return len; } @@ -122,6 +131,16 @@ static int linkmodes_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) return -EMSGSIZE; + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, + lsettings->master_slave_cfg)) + return -EMSGSIZE; + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, + lsettings->master_slave_state)) + return -EMSGSIZE; + return 0; } @@ -249,6 +268,8 @@ linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; /* Set advertised link modes to all supported modes matching requested speed @@ -287,14 +308,45 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static bool ethnl_validate_master_slave_cfg(u8 cfg) +{ + switch (cfg) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + case MASTER_SLAVE_CFG_MASTER_FORCE: + case MASTER_SLAVE_CFG_SLAVE_FORCE: + return true; + } + + return false; +} + static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod) { struct ethtool_link_settings *lsettings = &ksettings->base; bool req_speed, req_duplex; + const struct nlattr *master_slave_cfg; int ret; + master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; + if (master_slave_cfg) { + u8 cfg = nla_get_u8(master_slave_cfg); + + if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave configuration not supported by device"); + return -EOPNOTSUPP; + } + + if (!ethnl_validate_master_slave_cfg(cfg)) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave value is invalid"); + return -EOPNOTSUPP; + } + } + *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; @@ -311,6 +363,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); + ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && (req_speed || req_duplex) && -- cgit v1.2.3 From 50b6f619a099af6dfe06e958bfa08d46440ea788 Mon Sep 17 00:00:00 2001 From: Mika Kahola Date: Wed, 6 May 2020 15:08:27 +0300 Subject: uapi/drm/drm_fourcc.h: Note on platform specificity for format modifiers Make an additional note on DRM format modifiers for x and y tiling. These format modifiers are defined for BDW+ platforms and therefore definition is not valid for older gens. This is due to address swizzling for tiled surfaces is no longer used. For newer platforms main memory controller has a more effective address swizzling algorithm. v2: Rephrase comment (Daniel) Signed-off-by: Mika Kahola Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200506120827.12250-1-mika.kahola@intel.com --- include/uapi/drm/drm_fourcc.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 8bc0b31597d8..9e488d10f8b4 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -354,9 +354,12 @@ extern "C" { * a platform-dependent stride. On top of that the memory can apply * platform-depending swizzling of some higher address bits into bit6. * - * This format is highly platforms specific and not useful for cross-driver - * sharing. It exists since on a given platform it does uniquely identify the - * layout in a simple way for i915-specific userspace. + * Note that this layout is only accurate on intel gen 8+ or valleyview chipsets. + * On earlier platforms the is highly platforms specific and not useful for + * cross-driver sharing. It exists since on a given platform it does uniquely + * identify the layout in a simple way for i915-specific userspace, which + * facilitated conversion of userspace to modifiers. Additionally the exact + * format on some really old platforms is not known. */ #define I915_FORMAT_MOD_X_TILED fourcc_mod_code(INTEL, 1) @@ -369,9 +372,12 @@ extern "C" { * memory can apply platform-depending swizzling of some higher address bits * into bit6. * - * This format is highly platforms specific and not useful for cross-driver - * sharing. It exists since on a given platform it does uniquely identify the - * layout in a simple way for i915-specific userspace. + * Note that this layout is only accurate on intel gen 8+ or valleyview chipsets. + * On earlier platforms the is highly platforms specific and not useful for + * cross-driver sharing. It exists since on a given platform it does uniquely + * identify the layout in a simple way for i915-specific userspace, which + * facilitated conversion of userspace to modifiers. Additionally the exact + * format on some really old platforms is not known. */ #define I915_FORMAT_MOD_Y_TILED fourcc_mod_code(INTEL, 2) -- cgit v1.2.3 From 8086fbaf49345f988deec539ec8e182b02914401 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:11 -0700 Subject: bpf: Allow any port in bpf_bind helper We want to have a tighter control on what ports we bind to in the BPF_CGROUP_INET{4,6}_CONNECT hooks even if it means connect() becomes slightly more expensive. The expensive part comes from the fact that we now need to call inet_csk_get_port() that verifies that the port is not used and allocates an entry in the hash table for it. Since we can't rely on "snum || !bind_address_no_port" to prevent us from calling POST_BIND hook anymore, let's add another bind flag to indicate that the call site is BPF program. v5: * fix wrong AF_INET (should be AF_INET6) in the bpf program for v6 v3: * More bpf_bind documentation refinements (Martin KaFai Lau) * Add UDP tests as well (Martin KaFai Lau) * Don't start the thread, just do socket+bind+listen (Martin KaFai Lau) v2: * Update documentation (Andrey Ignatov) * Pass BIND_FORCE_ADDRESS_NO_PORT conditionally (Andrey Ignatov) Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-5-sdf@google.com --- include/net/inet_common.h | 2 + include/uapi/linux/bpf.h | 9 +- net/core/filter.c | 18 ++-- net/ipv4/af_inet.c | 10 +- net/ipv6/af_inet6.c | 12 ++- tools/include/uapi/linux/bpf.h | 9 +- .../selftests/bpf/prog_tests/connect_force_port.c | 115 +++++++++++++++++++++ .../selftests/bpf/progs/connect_force_port4.c | 28 +++++ .../selftests/bpf/progs/connect_force_port6.c | 28 +++++ 9 files changed, 203 insertions(+), 28 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/connect_force_port.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port4.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port6.c (limited to 'include/uapi') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c38f4f7d660a..cb2818862919 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -39,6 +39,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) +/* Called from BPF program. */ +#define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * diff --git a/net/core/filter.c b/net/core/filter.c index fa9ddab5dd1f..da0634979f53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4525,32 +4525,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, { #ifdef CONFIG_INET struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; int err; - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. - */ err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68e74b1b0f26..fcf0d12a407a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -526,10 +526,12 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out_release_sock; } - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); - if (err) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - goto out_release_sock; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 552c2592b81c..771a462a8322 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -407,11 +407,13 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out; } - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); - if (err) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - goto out; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c new file mode 100644 index 000000000000..47fbb20cb6a6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int verify_port(int family, int fd, int expected) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + __u16 port; + + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected) { + log_err("Unexpected port %d, expected %d", ntohs(port), + expected); + return -1; + } + + return 0; +} + +static int run_test(int cgroup_fd, int server_fd, int family, int type) +{ + struct bpf_prog_load_attr attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + }; + struct bpf_object *obj; + int expected_port; + int prog_fd; + int err; + int fd; + + if (family == AF_INET) { + attr.file = "./connect_force_port4.o"; + attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + expected_port = 22222; + } else { + attr.file = "./connect_force_port6.o"; + attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; + expected_port = 22223; + } + + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + if (err) { + log_err("Failed to load BPF object"); + return -1; + } + + err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, + 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + fd = connect_to_fd(family, type, server_fd); + if (fd < 0) { + err = -1; + goto close_bpf_object; + } + + err = verify_port(family, fd, expected_port); + + close(fd); + +close_bpf_object: + bpf_object__close(obj); + return err; +} + +void test_connect_force_port(void) +{ + int server_fd, cgroup_fd; + + cgroup_fd = test__join_cgroup("/connect_force_port"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server(AF_INET, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c new file mode 100644 index 000000000000..1b8eb34b2db0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect4") +int _connect4(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; + + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(22222); + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c new file mode 100644 index 000000000000..ae6f7d750b4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect6") +int _connect6(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = {}; + + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(22223); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} -- cgit v1.2.3 From 15d83c4d7cef5c067a8b075ce59e97df4f60706e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:00 -0700 Subject: bpf: Allow loading of a bpf_iter program A bpf_iter program is a tracing program with attach type BPF_TRACE_ITER. The load attribute attach_btf_id is used by the verifier against a particular kernel function, which represents a target, e.g., __bpf_iter__bpf_map for target bpf_map which is implemented later. The program return value must be 0 or 1 for now. 0 : successful, except potential seq_file buffer overflow which is handled by seq_file reader. 1 : request to restart the same object In the future, other return values may be used for filtering or teminating the iterator. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175900.2474947-1-yhs@fb.com --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_iter.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 21 +++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 62 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c78b86fe38..f28bdd714754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1127,6 +1127,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); +#define BPF_ITER_FUNC_PREFIX "__bpf_iter__" + typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); @@ -1140,6 +1142,7 @@ struct bpf_iter_reg { int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); +bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6e5e7caa3739..c8a5325cc8d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -218,6 +218,7 @@ enum bpf_attach_type { BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, BPF_LSM_MAC, + BPF_TRACE_ITER, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5a8119d17d14..dec182d8395a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -12,6 +12,7 @@ struct bpf_iter_target_info { bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 btf_id; /* cached value */ }; static struct list_head targets = LIST_HEAD_INIT(targets); @@ -57,3 +58,38 @@ void bpf_iter_unreg_target(const char *target) WARN_ON(found == false); } + +static void cache_btf_id(struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + tinfo->btf_id = prog->aux->attach_btf_id; +} + +bool bpf_iter_prog_supported(struct bpf_prog *prog) +{ + const char *attach_fname = prog->aux->attach_func_name; + u32 prog_btf_id = prog->aux->attach_btf_id; + const char *prefix = BPF_ITER_FUNC_PREFIX; + struct bpf_iter_target_info *tinfo; + int prefix_len = strlen(prefix); + bool supported = false; + + if (strncmp(attach_fname, prefix, prefix_len)) + return false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { + supported = true; + break; + } + if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + cache_btf_id(tinfo, prog); + supported = true; + break; + } + } + mutex_unlock(&targets_mutex); + + return supported; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 70ad009577f8..d725ff7d11db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; range = tnum_const(0); break; + case BPF_PROG_TYPE_TRACING: + if (env->prog->expected_attach_type != BPF_TRACE_ITER) + return 0; + break; default: return 0; } @@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct btf_func_model fmodel; int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; @@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_ITER: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + ret = btf_distill_func_proto(&env->log, btf, t, + tname, &fmodel); + return ret; default: if (!prog_extension) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6e5e7caa3739..c8a5325cc8d0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -218,6 +218,7 @@ enum bpf_attach_type { BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, BPF_LSM_MAC, + BPF_TRACE_ITER, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From de4e05cac46d206f9090051ef09930514bff73e4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:01 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_CREATE Given a bpf program, the step to create an anonymous bpf iterator is: - create a bpf_iter_link, which combines bpf program and the target. In the future, there could be more information recorded in the link. A link_fd will be returned to the user space. - create an anonymous bpf iterator with the given link_fd. The bpf_iter_link can be pinned to bpffs mount file system to create a file based bpf iterator as well. The benefit to use of bpf_iter_link: - using bpf link simplifies design and implementation as bpf link is used for other tracing bpf programs. - for file based bpf iterator, bpf_iter_link provides a standard way to replace underlying bpf programs. - for both anonymous and free based iterators, bpf link query capability can be leveraged. The patch added support of tracing/iter programs for BPF_LINK_CREATE. A new link type BPF_LINK_TYPE_ITER is added to facilitate link querying. Currently, only prog_id is needed, so there is no additional in-kernel show_fdinfo() and fill_link_info() hook is needed for BPF_LINK_TYPE_ITER link. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175901.2475084-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_iter.c | 62 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 14 ++++++++++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 80 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f28bdd714754..e93d2d33c82c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1143,6 +1143,7 @@ struct bpf_iter_reg { int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 8345cdf553b8..29d22752fc87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -124,3 +124,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) #ifdef CONFIG_CGROUP_BPF BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) #endif +BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8a5325cc8d0..1e8dfff5d5d4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -229,6 +229,7 @@ enum bpf_link_type { BPF_LINK_TYPE_RAW_TRACEPOINT = 1, BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dec182d8395a..03f5832909db 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -15,6 +15,11 @@ struct bpf_iter_target_info { u32 btf_id; /* cached value */ }; +struct bpf_iter_link { + struct bpf_link link; + struct bpf_iter_target_info *tinfo; +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); @@ -93,3 +98,60 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } + +static void bpf_iter_link_release(struct bpf_link *link) +{ +} + +static void bpf_iter_link_dealloc(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + kfree(iter_link); +} + +static const struct bpf_link_ops bpf_iter_link_lops = { + .release = bpf_iter_link_release, + .dealloc = bpf_iter_link_dealloc, +}; + +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_iter_target_info *tinfo; + struct bpf_iter_link *link; + bool existed = false; + u32 prog_btf_id; + int err; + + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + prog_btf_id = prog->aux->attach_btf_id; + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog_btf_id) { + existed = true; + break; + } + } + mutex_unlock(&targets_mutex); + if (!existed) + return -ENOENT; + + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + link->tinfo = tinfo; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + return bpf_link_settle(&link_primer); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bb1ab7da6103..6ffe2d8fb6c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_ITER: + return BPF_PROG_TYPE_TRACING; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3729,6 +3731,15 @@ err_put: return err; } +static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + if (attr->link_create.attach_type == BPF_TRACE_ITER && + prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + + return -EINVAL; +} + #define BPF_LINK_CREATE_LAST_FIELD link_create.flags static int link_create(union bpf_attr *attr) { @@ -3765,6 +3776,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = tracing_bpf_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c8a5325cc8d0..1e8dfff5d5d4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -229,6 +229,7 @@ enum bpf_link_type { BPF_LINK_TYPE_RAW_TRACEPOINT = 1, BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3 From ac51d99bf81caac8d8881fe52098948110d0de68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:05 -0700 Subject: bpf: Create anonymous bpf iterator A new bpf command BPF_ITER_CREATE is added. The anonymous bpf iterator is seq_file based. The seq_file private data are referenced by targets. The bpf_iter infrastructure allocated additional space at seq_file->private before the space used by targets to store some meta data, e.g., prog: prog to run session_id: an unique id for each opened seq_file seq_num: how many times bpf programs are queried in this session done_stop: an internal state to decide whether bpf program should be called in seq_ops->stop() or not The seq_num will start from 0 for valid objects. The bpf program may see the same seq_num more than once if - seq_file buffer overflow happens and the same object is retried by bpf_seq_read(), or - the bpf program explicitly requests a retry of the same object Since module is not supported for bpf_iter, all target registeration happens at __init time, so there is no need to change bpf_iter_unreg_target() as it is used mostly in error path of the init function at which time no bpf iterators have been created yet. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175905.2475770-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++ kernel/bpf/bpf_iter.c | 129 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 26 +++++++++ tools/include/uapi/linux/bpf.h | 6 ++ 5 files changed, 168 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e93d2d33c82c..80b1b9d8a638 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1144,6 +1144,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_iter_new_fd(struct bpf_link *link); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e8dfff5d5d4..708763f702e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_cmd { BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, + BPF_ITER_CREATE, }; enum bpf_map_type { @@ -614,6 +615,11 @@ union bpf_attr { __u32 type; } enable_stats; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 832973ee80fa..e7129b57865f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include +#include #include #include @@ -20,12 +21,24 @@ struct bpf_iter_link { struct bpf_iter_target_info *tinfo; }; +struct bpf_iter_priv_data { + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool done_stop; + u8 target_private[] __aligned(8); +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* incremented on every opened seq_file */ +static atomic64_t session_id; + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -149,6 +162,33 @@ done: return copied; } +static int iter_release(struct inode *inode, struct file *file) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + + seq = file->private_data; + if (!seq) + return 0; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + + if (iter_priv->tinfo->fini_seq_private) + iter_priv->tinfo->fini_seq_private(seq->private); + + bpf_prog_put(iter_priv->prog); + seq->private = iter_priv; + + return seq_release_private(inode, file); +} + +static const struct file_operations bpf_iter_fops = { + .llseek = no_llseek, + .read = bpf_seq_read, + .release = iter_release, +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -309,3 +349,92 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return bpf_link_settle(&link_primer); } + +static void init_seq_meta(struct bpf_iter_priv_data *priv_data, + struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + priv_data->tinfo = tinfo; + priv_data->prog = prog; + priv_data->session_id = atomic64_inc_return(&session_id); + priv_data->seq_num = 0; + priv_data->done_stop = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +{ + struct bpf_iter_priv_data *priv_data; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + struct seq_file *seq; + int err = 0; + + mutex_lock(&link_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&link_mutex); + + tinfo = link->tinfo; + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + + tinfo->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + if (!priv_data) { + err = -ENOMEM; + goto release_prog; + } + + if (tinfo->init_seq_private) { + err = tinfo->init_seq_private(priv_data->target_private); + if (err) + goto release_seq_file; + } + + init_seq_meta(priv_data, tinfo, prog); + seq = file->private_data; + seq->private = priv_data->target_private; + + return 0; + +release_seq_file: + seq_release_private(file->f_inode, file); + file->private_data = NULL; +release_prog: + bpf_prog_put(prog); + return err; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct file *file; + unsigned int flags; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + flags = O_RDONLY | O_CLOEXEC; + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + err = prepare_seq_file(file, + container_of(link, struct bpf_iter_link, link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ffe2d8fb6c7..a293e88ee01a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3941,6 +3941,29 @@ static int bpf_enable_stats(union bpf_attr *attr) return -EINVAL; } +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + bpf_link_put(link); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4068,6 +4091,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1e8dfff5d5d4..708763f702e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_cmd { BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, + BPF_ITER_CREATE, }; enum bpf_map_type { @@ -614,6 +615,11 @@ union bpf_attr { __u32 type; } enable_stats; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 492e639f0c222784e2e0f121966375f641c61b15 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:14 -0700 Subject: bpf: Add bpf_seq_printf and bpf_seq_write helpers Two helpers bpf_seq_printf and bpf_seq_write, are added for writing data to the seq_file buffer. bpf_seq_printf supports common format string flag/width/type fields so at least I can get identical results for netlink and ipv6_route targets. For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW specifically indicates a write failure due to overflow, which means the object will be repeated in the next bpf invocation if object collection stays the same. Note that if the object collection is changed, depending how collection traversal is done, even if the object still in the collection, it may not be visited. For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to read kernel memory. Reading kernel memory may fail in the following two cases: - invalid kernel address, or - valid kernel address but requiring a major fault If reading kernel memory failed, the %s string will be an empty string and %p{i,I}{4,6} will be all 0. Not returning error to bpf program is consistent with what bpf_trace_printk() does for now. bpf_seq_printf may return -EBUSY meaning that internal percpu buffer for memory copy of strings or other pointees is not available. Bpf program can return 1 to indicate it wants the same object to be repeated. Right now, this should not happen on no-RT kernels since migrate_disable(), which guards bpf prog call, calls preempt_disable(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com --- include/uapi/linux/bpf.h | 39 +++++++- kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 + tools/include/uapi/linux/bpf.h | 39 +++++++- 4 files changed, 292 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 708763f702e1..9d1932e23cec 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3077,6 +3077,41 @@ union bpf_attr { * See: clock_gettime(CLOCK_BOOTTIME) * Return * Current *ktime*. + * + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * seq_printf uses seq_file seq_printf() to print out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a u64 array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the *data* size in term of bytes. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what bpf_trace_printk() does for now. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EBUSY** Percpu memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. + * * **-E2BIG** Too many format specifiers. + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * seq_write uses seq_file seq_write() to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3204,7 +3239,9 @@ union bpf_attr { FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ FN(sk_assign), \ - FN(ktime_get_boot_ns), + FN(ktime_get_boot_ns), \ + FN(seq_printf), \ + FN(seq_write), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e875c95d3ced..d961428fb5b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -457,6 +457,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +#define MAX_SEQ_PRINTF_VARARGS 12 +#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 +#define MAX_SEQ_PRINTF_STR_LEN 128 + +struct bpf_seq_printf_buf { + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); +static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) +{ + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; + int i, buf_used, copy_size, num_args; + u64 params[MAX_SEQ_PRINTF_VARARGS]; + struct bpf_seq_printf_buf *bufs; + const u64 *args = data; + + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); + if (WARN_ON_ONCE(buf_used > 1)) { + err = -EBUSY; + goto out; + } + + bufs = this_cpu_ptr(&bpf_seq_printf_buf); + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + goto out; + + if (data_len & 7) + goto out; + + for (i = 0; i < fmt_size; i++) { + if (fmt[i] == '%') { + if (fmt[i + 1] == '%') + i++; + else if (!data || !data_len) + goto out; + } + } + + num_args = data_len / 8; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + /* only printable ascii for now. */ + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { + err = -E2BIG; + goto out; + } + + if (fmt_cnt >= num_args) { + err = -EINVAL; + goto out; + } + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + + /* skip optional "[0 +-][num]" width formating field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 's') { + /* try our best to copy */ + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); + if (err < 0) + bufs->buf[memcpy_cnt][0] = '\0'; + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'p') { + if (fmt[i + 1] == 0 || + fmt[i + 1] == 'K' || + fmt[i + 1] == 'x') { + /* just kernel pointers */ + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { + err = -EINVAL; + goto out; + } + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { + err = -EINVAL; + goto out; + } + + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + + err = probe_kernel_read(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + copy_size); + if (err < 0) + memset(bufs->buf[memcpy_cnt], 0, copy_size); + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + i += 2; + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'l') { + i++; + if (fmt[i] == 'l') + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') { + err = -EINVAL; + goto out; + } + + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + } + + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + * all of them to seq_printf(). + */ + seq_printf(m, fmt, params[0], params[1], params[2], params[3], + params[4], params[5], params[6], params[7], params[8], + params[9], params[10], params[11]); + + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; +out: + this_cpu_dec(bpf_seq_printf_buf_used); + return err; +} + +static int bpf_seq_printf_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_printf_btf_ids, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? -EOVERFLOW : 0; +} + +static int bpf_seq_write_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_write_btf_ids, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -1226,6 +1432,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; #endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; default: return raw_tp_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index f43d193aff3a..ded304c96a05 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -414,6 +414,7 @@ class PrinterHelpers(Printer): 'struct sk_reuseport_md', 'struct sockaddr', 'struct tcphdr', + 'struct seq_file', 'struct __sk_buff', 'struct sk_msg_md', @@ -450,6 +451,7 @@ class PrinterHelpers(Printer): 'struct sk_reuseport_md', 'struct sockaddr', 'struct tcphdr', + 'struct seq_file', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 708763f702e1..9d1932e23cec 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3077,6 +3077,41 @@ union bpf_attr { * See: clock_gettime(CLOCK_BOOTTIME) * Return * Current *ktime*. + * + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * seq_printf uses seq_file seq_printf() to print out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a u64 array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the *data* size in term of bytes. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what bpf_trace_printk() does for now. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EBUSY** Percpu memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. + * * **-E2BIG** Too many format specifiers. + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * seq_write uses seq_file seq_write() to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3204,7 +3239,9 @@ union bpf_attr { FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ FN(sk_assign), \ - FN(ktime_get_boot_ns), + FN(ktime_get_boot_ns), \ + FN(seq_printf), \ + FN(seq_write), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 11ca3c4261cdb4e2f33e32daf6447f8185843317 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:33 +0200 Subject: net: ethtool: netlink: Add support for triggering a cable test Add new ethtool netlink calls to trigger the starting of a PHY cable test. Add Kconfig'ury to ETHTOOL_NETLINK so that PHYLIB is not a module when ETHTOOL_NETLINK is builtin, which would result in kernel linking errors. v2: Remove unwanted white space change Remove ethnl_cable_test_act_ops and use doit handler Rename cable_test_set_policy cable_test_act_policy Remove ETHTOOL_MSG_CABLE_TEST_ACT_REPLY v3: Remove ETHTOOL_MSG_CABLE_TEST_ACT_REPLY from documentation Remove unused cable_test_get_policy Add Reviewed-by tags v4: Remove unwanted blank line Signed-off-by: Andrew Lunn Reviewed-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 16 ++++++++- include/uapi/linux/ethtool_netlink.h | 12 +++++++ net/Kconfig | 1 + net/ethtool/Makefile | 2 +- net/ethtool/cabletest.c | 54 ++++++++++++++++++++++++++++ net/ethtool/netlink.c | 5 +++ net/ethtool/netlink.h | 1 + 7 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/cabletest.c (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 8f5cefc539cf..a8731d33d0c9 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -204,6 +204,7 @@ Userspace to kernel: ``ETHTOOL_MSG_EEE_GET`` get EEE settings ``ETHTOOL_MSG_EEE_SET`` set EEE settings ``ETHTOOL_MSG_TSINFO_GET`` get timestamping info + ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test ===================================== ================================ Kernel to userspace: @@ -958,13 +959,25 @@ Kernel response contents: is no special value for this case). The bitset attributes are omitted if they would be empty (no bit set). +CABLE_TEST +========== + +Start a cable test. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_CABLE_TEST_HEADER`` nested request header + ==================================== ====== ========================== + Request translation =================== The following table maps ioctl commands to netlink commands providing their functionality. Entries with "n/a" in right column are commands which do not -have their netlink replacement yet. +have their netlink replacement yet. Entries which "n/a" in the left column +are netlink only. =================================== ===================================== ioctl command netlink command @@ -1053,4 +1066,5 @@ have their netlink replacement yet. ``ETHTOOL_PHY_STUNABLE`` n/a ``ETHTOOL_GFECPARAM`` n/a ``ETHTOOL_SFECPARAM`` n/a + n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' =================================== ===================================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index bf1d310e20bc..6bfd648c32cf 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -39,6 +39,7 @@ enum { ETHTOOL_MSG_EEE_GET, ETHTOOL_MSG_EEE_SET, ETHTOOL_MSG_TSINFO_GET, + ETHTOOL_MSG_CABLE_TEST_ACT, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -405,6 +406,17 @@ enum { ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) }; +/* CABLE TEST */ + +enum { + ETHTOOL_A_CABLE_TEST_UNSPEC, + ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_CNT, + ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/Kconfig b/net/Kconfig index c5ba2d180c43..5c524c6ee75d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -455,6 +455,7 @@ config FAILOVER config ETHTOOL_NETLINK bool "Netlink interface for ethtool" default y + depends on PHYLIB=y || PHYLIB=n help An alternative userspace interface for ethtool based on generic netlink. It provides better extensibility and some new features, diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 6c360c9c9370..0c2b94f20499 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c new file mode 100644 index 000000000000..aeb6672a46d0 --- /dev/null +++ b/net/ethtool/cabletest.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include "netlink.h" +#include "common.h" + +/* CABLE_TEST_ACT */ + +static const struct nla_policy +cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, +}; + +int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_MAX, + cable_test_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test(dev->phydev, info->extack); + + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0c772318c023..b9c9ddf408fe 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -839,6 +839,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 81b8fa020bcb..bd7df592db2f 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -357,5 +357,6 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From b28efb930ba5a7c263826fe02e13e1b6eadb5559 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 10 May 2020 21:12:34 +0200 Subject: net: ethtool: Add attributes for cable test reports Add the attributes needed to report cable test results to userspace. The reports are expected to be per twisted pair. A nested property per pair can report the result of the cable test. A nested property can also report the length of the cable to any fault. v2: Grammar fixes Change length from u16 to u32 s/DEV/HEADER/g Add status attributes Rename pairs from numbers to letters. v3: Fixed example in document Add ETHTOOL_A_CABLE_NEST_* enum Add ETHTOOL_MSG_CABLE_TEST_NTF to documentation Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Michal Kubecek Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 41 +++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 59 ++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index a8731d33d0c9..eed46b6aa07d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -236,6 +236,7 @@ Kernel to userspace: ``ETHTOOL_MSG_EEE_GET_REPLY`` EEE settings ``ETHTOOL_MSG_EEE_NTF`` EEE settings ``ETHTOOL_MSG_TSINFO_GET_REPLY`` timestamping info + ``ETHTOOL_MSG_CABLE_TEST_NTF`` Cable test results ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -970,6 +971,46 @@ Request contents: ``ETHTOOL_A_CABLE_TEST_HEADER`` nested request header ==================================== ====== ========================== +Notification contents: + +An Ethernet cable typically contains 1, 2 or 4 pairs. The length of +the pair can only be measured when there is a fault in the pair and +hence a reflection. Information about the fault may not be available, +depending on the specific hardware. Hence the contents of the notify +message are mostly optional. The attributes can be repeated an +arbitrary number of times, in an arbitrary order, for an arbitrary +number of pairs. + +The example shows the notification sent when the test is completed for +a T2 cable, i.e. two pairs. One pair is OK and hence has no length +information. The second pair has a fault and does have length +information. + + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_CABLE_TEST_HEADER`` | nested | reply header | + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_CABLE_TEST_STATUS`` | u8 | completed | + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_CABLE_TEST_NTF_NEST`` | nested | all the results | + +-+-------------------------------------------+--------+---------------------+ + | | ``ETHTOOL_A_CABLE_NEST_RESULT`` | nested | cable test result | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_CODE`` | u8 | result code | + +-+-+-----------------------------------------+--------+---------------------+ + | | ``ETHTOOL_A_CABLE_NEST_RESULT`` | nested | cable test results | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_CODE`` | u8 | result code | + +-+-+-----------------------------------------+--------+---------------------+ + | | ``ETHTOOL_A_CABLE_NEST_FAULT_LENGTH`` | nested | cable length | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_CABLE_FAULT_LENGTH_CM`` | u32 | length in cm | + +-+-+-----------------------------------------+--------+---------------------+ Request translation =================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 6bfd648c32cf..2881af411f76 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -75,6 +75,7 @@ enum { ETHTOOL_MSG_EEE_GET_REPLY, ETHTOOL_MSG_EEE_NTF, ETHTOOL_MSG_TSINFO_GET_REPLY, + ETHTOOL_MSG_CABLE_TEST_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -417,6 +418,64 @@ enum { ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 }; +/* CABLE TEST NOTIFY */ +enum { + ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC, + ETHTOOL_A_CABLE_RESULT_CODE_OK, + ETHTOOL_A_CABLE_RESULT_CODE_OPEN, + ETHTOOL_A_CABLE_RESULT_CODE_SAME_SHORT, + ETHTOOL_A_CABLE_RESULT_CODE_CROSS_SHORT, +}; + +enum { + ETHTOOL_A_CABLE_PAIR_A, + ETHTOOL_A_CABLE_PAIR_B, + ETHTOOL_A_CABLE_PAIR_C, + ETHTOOL_A_CABLE_PAIR_D, +}; + +enum { + ETHTOOL_A_CABLE_RESULT_UNSPEC, + ETHTOOL_A_CABLE_RESULT_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ + ETHTOOL_A_CABLE_RESULT_CODE, /* u8 ETHTOOL_A_CABLE_RESULT_CODE_ */ + + __ETHTOOL_A_CABLE_RESULT_CNT, + ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, + ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ + ETHTOOL_A_CABLE_FAULT_LENGTH_CM, /* u32 */ + + __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, + ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_NTF_STATUS_UNSPEC, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED +}; + +enum { + ETHTOOL_A_CABLE_NEST_UNSPEC, + ETHTOOL_A_CABLE_NEST_RESULT, /* nest - ETHTOOL_A_CABLE_RESULT_ */ + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, /* nest - ETHTOOL_A_CABLE_FAULT_LENGTH_ */ + __ETHTOOL_A_CABLE_NEST_CNT, + ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ + ETHTOOL_A_CABLE_TEST_NTF_NEST, /* nest - of results: */ + + __ETHTOOL_A_CABLE_TEST_NTF_CNT, + ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From 9e3307a169537a6adc30b13bf9063e94990a5493 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sun, 3 May 2020 17:53:37 +0200 Subject: mtd: Add support for emulated SLC mode on MLC NANDs MLC NANDs can be made a bit more reliable if we only program the lower page of each pair. At least, this solves the paired-pages corruption issue. Signed-off-by: Boris Brezillon Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200503155341.16712-5-miquel.raynal@bootlin.com --- drivers/mtd/mtdcore.c | 189 +++++++++++++++++++++++++++++++++++++---- drivers/mtd/mtdpart.c | 54 +++++++----- include/linux/mtd/mtd.h | 7 +- include/linux/mtd/partitions.h | 2 + include/uapi/mtd/mtd-abi.h | 1 + 5 files changed, 213 insertions(+), 40 deletions(-) (limited to 'include/uapi') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 2916674208b3..50437b4ffe76 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -617,6 +617,19 @@ int add_mtd_device(struct mtd_info *mtd) !(mtd->flags & MTD_NO_ERASE))) return -EINVAL; + /* + * MTD_SLC_ON_MLC_EMULATION can only be set on partitions, when the + * master is an MLC NAND and has a proper pairing scheme defined. + * We also reject masters that implement ->_writev() for now, because + * NAND controller drivers don't implement this hook, and adding the + * SLC -> MLC address/length conversion to this path is useless if we + * don't have a user. + */ + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION && + (!mtd_is_partition(mtd) || master->type != MTD_MLCNANDFLASH || + !master->pairing || master->_writev)) + return -EINVAL; + mutex_lock(&mtd_table_mutex); i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); @@ -632,6 +645,14 @@ int add_mtd_device(struct mtd_info *mtd) if (mtd->bitflip_threshold == 0) mtd->bitflip_threshold = mtd->ecc_strength; + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + int ngroups = mtd_pairing_groups(master); + + mtd->erasesize /= ngroups; + mtd->size = (u64)mtd_div_by_eb(mtd->size, master) * + mtd->erasesize; + } + if (is_power_of_2(mtd->erasesize)) mtd->erasesize_shift = ffs(mtd->erasesize) - 1; else @@ -1074,9 +1095,11 @@ int mtd_erase(struct mtd_info *mtd, struct erase_info *instr) { struct mtd_info *master = mtd_get_master(mtd); u64 mst_ofs = mtd_get_master_ofs(mtd, 0); + struct erase_info adjinstr; int ret; instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; + adjinstr = *instr; if (!mtd->erasesize || !master->_erase) return -ENOTSUPP; @@ -1091,12 +1114,27 @@ int mtd_erase(struct mtd_info *mtd, struct erase_info *instr) ledtrig_mtd_activity(); - instr->addr += mst_ofs; - ret = master->_erase(master, instr); - if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN) - instr->fail_addr -= mst_ofs; + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + adjinstr.addr = (loff_t)mtd_div_by_eb(instr->addr, mtd) * + master->erasesize; + adjinstr.len = ((u64)mtd_div_by_eb(instr->addr + instr->len, mtd) * + master->erasesize) - + adjinstr.addr; + } + + adjinstr.addr += mst_ofs; + + ret = master->_erase(master, &adjinstr); + + if (adjinstr.fail_addr != MTD_FAIL_ADDR_UNKNOWN) { + instr->fail_addr = adjinstr.fail_addr - mst_ofs; + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + instr->fail_addr = mtd_div_by_eb(instr->fail_addr, + master); + instr->fail_addr *= mtd->erasesize; + } + } - instr->addr -= mst_ofs; return ret; } EXPORT_SYMBOL_GPL(mtd_erase); @@ -1276,6 +1314,101 @@ static int mtd_check_oob_ops(struct mtd_info *mtd, loff_t offs, return 0; } +static int mtd_read_oob_std(struct mtd_info *mtd, loff_t from, + struct mtd_oob_ops *ops) +{ + struct mtd_info *master = mtd_get_master(mtd); + int ret; + + from = mtd_get_master_ofs(mtd, from); + if (master->_read_oob) + ret = master->_read_oob(master, from, ops); + else + ret = master->_read(master, from, ops->len, &ops->retlen, + ops->datbuf); + + return ret; +} + +static int mtd_write_oob_std(struct mtd_info *mtd, loff_t to, + struct mtd_oob_ops *ops) +{ + struct mtd_info *master = mtd_get_master(mtd); + int ret; + + to = mtd_get_master_ofs(mtd, to); + if (master->_write_oob) + ret = master->_write_oob(master, to, ops); + else + ret = master->_write(master, to, ops->len, &ops->retlen, + ops->datbuf); + + return ret; +} + +static int mtd_io_emulated_slc(struct mtd_info *mtd, loff_t start, bool read, + struct mtd_oob_ops *ops) +{ + struct mtd_info *master = mtd_get_master(mtd); + int ngroups = mtd_pairing_groups(master); + int npairs = mtd_wunit_per_eb(master) / ngroups; + struct mtd_oob_ops adjops = *ops; + unsigned int wunit, oobavail; + struct mtd_pairing_info info; + int max_bitflips = 0; + u32 ebofs, pageofs; + loff_t base, pos; + + ebofs = mtd_mod_by_eb(start, mtd); + base = (loff_t)mtd_div_by_eb(start, mtd) * master->erasesize; + info.group = 0; + info.pair = mtd_div_by_ws(ebofs, mtd); + pageofs = mtd_mod_by_ws(ebofs, mtd); + oobavail = mtd_oobavail(mtd, ops); + + while (ops->retlen < ops->len || ops->oobretlen < ops->ooblen) { + int ret; + + if (info.pair >= npairs) { + info.pair = 0; + base += master->erasesize; + } + + wunit = mtd_pairing_info_to_wunit(master, &info); + pos = mtd_wunit_to_offset(mtd, base, wunit); + + adjops.len = ops->len - ops->retlen; + if (adjops.len > mtd->writesize - pageofs) + adjops.len = mtd->writesize - pageofs; + + adjops.ooblen = ops->ooblen - ops->oobretlen; + if (adjops.ooblen > oobavail - adjops.ooboffs) + adjops.ooblen = oobavail - adjops.ooboffs; + + if (read) { + ret = mtd_read_oob_std(mtd, pos + pageofs, &adjops); + if (ret > 0) + max_bitflips = max(max_bitflips, ret); + } else { + ret = mtd_write_oob_std(mtd, pos + pageofs, &adjops); + } + + if (ret < 0) + return ret; + + max_bitflips = max(max_bitflips, ret); + ops->retlen += adjops.retlen; + ops->oobretlen += adjops.oobretlen; + adjops.datbuf += adjops.retlen; + adjops.oobbuf += adjops.oobretlen; + adjops.ooboffs = 0; + pageofs = 0; + info.pair++; + } + + return max_bitflips; +} + int mtd_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); @@ -1294,12 +1427,10 @@ int mtd_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) if (!master->_read_oob && (!master->_read || ops->oobbuf)) return -EOPNOTSUPP; - from = mtd_get_master_ofs(mtd, from); - if (master->_read_oob) - ret_code = master->_read_oob(master, from, ops); + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + ret_code = mtd_io_emulated_slc(mtd, from, true, ops); else - ret_code = master->_read(master, from, ops->len, &ops->retlen, - ops->datbuf); + ret_code = mtd_read_oob_std(mtd, from, ops); mtd_update_ecc_stats(mtd, master, &old_stats); @@ -1338,13 +1469,10 @@ int mtd_write_oob(struct mtd_info *mtd, loff_t to, if (!master->_write_oob && (!master->_write || ops->oobbuf)) return -EOPNOTSUPP; - to = mtd_get_master_ofs(mtd, to); + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + return mtd_io_emulated_slc(mtd, to, false, ops); - if (master->_write_oob) - return master->_write_oob(master, to, ops); - else - return master->_write(master, to, ops->len, &ops->retlen, - ops->datbuf); + return mtd_write_oob_std(mtd, to, ops); } EXPORT_SYMBOL_GPL(mtd_write_oob); @@ -1817,6 +1945,12 @@ int mtd_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) return -EINVAL; if (!len) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; + } + return master->_lock(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_lock); @@ -1831,6 +1965,12 @@ int mtd_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) return -EINVAL; if (!len) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; + } + return master->_unlock(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_unlock); @@ -1845,6 +1985,12 @@ int mtd_is_locked(struct mtd_info *mtd, loff_t ofs, uint64_t len) return -EINVAL; if (!len) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + len = (u64)mtd_div_by_eb(len, mtd) * master->erasesize; + } + return master->_is_locked(master, mtd_get_master_ofs(mtd, ofs), len); } EXPORT_SYMBOL_GPL(mtd_is_locked); @@ -1857,6 +2003,10 @@ int mtd_block_isreserved(struct mtd_info *mtd, loff_t ofs) return -EINVAL; if (!master->_block_isreserved) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + return master->_block_isreserved(master, mtd_get_master_ofs(mtd, ofs)); } EXPORT_SYMBOL_GPL(mtd_block_isreserved); @@ -1869,6 +2019,10 @@ int mtd_block_isbad(struct mtd_info *mtd, loff_t ofs) return -EINVAL; if (!master->_block_isbad) return 0; + + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + return master->_block_isbad(master, mtd_get_master_ofs(mtd, ofs)); } EXPORT_SYMBOL_GPL(mtd_block_isbad); @@ -1885,6 +2039,9 @@ int mtd_block_markbad(struct mtd_info *mtd, loff_t ofs) if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; + if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) + ofs = (loff_t)mtd_div_by_eb(ofs, mtd) * master->erasesize; + ret = master->_block_markbad(master, mtd_get_master_ofs(mtd, ofs)); if (ret) return ret; diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 3f6025684f58..c3575b686f79 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -35,9 +35,12 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, const struct mtd_partition *part, int partno, uint64_t cur_offset) { - int wr_alignment = (parent->flags & MTD_NO_ERASE) ? parent->writesize : - parent->erasesize; - struct mtd_info *child, *master = mtd_get_master(parent); + struct mtd_info *master = mtd_get_master(parent); + int wr_alignment = (parent->flags & MTD_NO_ERASE) ? + master->writesize : master->erasesize; + u64 parent_size = mtd_is_partition(parent) ? + parent->part.size : parent->size; + struct mtd_info *child; u32 remainder; char *name; u64 tmp; @@ -56,8 +59,9 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, /* set up the MTD object for this partition */ child->type = parent->type; child->part.flags = parent->flags & ~part->mask_flags; + child->part.flags |= part->add_flags; child->flags = child->part.flags; - child->size = part->size; + child->part.size = part->size; child->writesize = parent->writesize; child->writebufsize = parent->writebufsize; child->oobsize = parent->oobsize; @@ -98,29 +102,29 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, } if (child->part.offset == MTDPART_OFS_RETAIN) { child->part.offset = cur_offset; - if (parent->size - child->part.offset >= child->size) { - child->size = parent->size - child->part.offset - - child->size; + if (parent_size - child->part.offset >= child->part.size) { + child->part.size = parent_size - child->part.offset - + child->part.size; } else { printk(KERN_ERR "mtd partition \"%s\" doesn't have enough space: %#llx < %#llx, disabled\n", - part->name, parent->size - child->part.offset, - child->size); + part->name, parent_size - child->part.offset, + child->part.size); /* register to preserve ordering */ goto out_register; } } - if (child->size == MTDPART_SIZ_FULL) - child->size = parent->size - child->part.offset; + if (child->part.size == MTDPART_SIZ_FULL) + child->part.size = parent_size - child->part.offset; printk(KERN_NOTICE "0x%012llx-0x%012llx : \"%s\"\n", - child->part.offset, child->part.offset + child->size, + child->part.offset, child->part.offset + child->part.size, child->name); /* let's do some sanity checks */ - if (child->part.offset >= parent->size) { + if (child->part.offset >= parent_size) { /* let's register it anyway to preserve ordering */ child->part.offset = 0; - child->size = 0; + child->part.size = 0; /* Initialize ->erasesize to make add_mtd_device() happy. */ child->erasesize = parent->erasesize; @@ -128,15 +132,16 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, part->name); goto out_register; } - if (child->part.offset + child->size > parent->size) { - child->size = parent->size - child->part.offset; + if (child->part.offset + child->part.size > parent->size) { + child->part.size = parent_size - child->part.offset; printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#llx\n", - part->name, parent->name, child->size); + part->name, parent->name, child->part.size); } + if (parent->numeraseregions > 1) { /* Deal with variable erase size stuff */ int i, max = parent->numeraseregions; - u64 end = child->part.offset + child->size; + u64 end = child->part.offset + child->part.size; struct mtd_erase_region_info *regions = parent->eraseregions; /* Find the first erase regions which is part of this @@ -156,7 +161,7 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, BUG_ON(child->erasesize == 0); } else { /* Single erase size */ - child->erasesize = parent->erasesize; + child->erasesize = master->erasesize; } /* @@ -178,7 +183,7 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, part->name); } - tmp = mtd_get_master_ofs(child, 0) + child->size; + tmp = mtd_get_master_ofs(child, 0) + child->part.size; remainder = do_div(tmp, wr_alignment); if ((child->flags & MTD_WRITEABLE) && remainder) { child->flags &= ~MTD_WRITEABLE; @@ -186,6 +191,7 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, part->name); } + child->size = child->part.size; child->ecc_step_size = parent->ecc_step_size; child->ecc_strength = parent->ecc_strength; child->bitflip_threshold = parent->bitflip_threshold; @@ -193,7 +199,7 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, if (master->_block_isbad) { uint64_t offs = 0; - while (offs < child->size) { + while (offs < child->part.size) { if (mtd_block_isreserved(child, offs)) child->ecc_stats.bbtblocks++; else if (mtd_block_isbad(child, offs)) @@ -234,6 +240,8 @@ int mtd_add_partition(struct mtd_info *parent, const char *name, long long offset, long long length) { struct mtd_info *master = mtd_get_master(parent); + u64 parent_size = mtd_is_partition(parent) ? + parent->part.size : parent->size; struct mtd_partition part; struct mtd_info *child; int ret = 0; @@ -244,7 +252,7 @@ int mtd_add_partition(struct mtd_info *parent, const char *name, return -EINVAL; if (length == MTDPART_SIZ_FULL) - length = parent->size - offset; + length = parent_size - offset; if (length <= 0) return -EINVAL; @@ -419,7 +427,7 @@ int add_mtd_partitions(struct mtd_info *parent, /* Look for subpartitions */ parse_mtd_partitions(child, parts[i].types, NULL); - cur_offset = child->part.offset + child->size; + cur_offset = child->part.offset + child->part.size; } return 0; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 2d1f4a61f4ac..157357ec1441 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -200,6 +200,8 @@ struct mtd_debug_info { * * @node: list node used to add an MTD partition to the parent partition list * @offset: offset of the partition relatively to the parent offset + * @size: partition size. Should be equal to mtd->size unless + * MTD_SLC_ON_MLC_EMULATION is set * @flags: original flags (before the mtdpart logic decided to tweak them based * on flash constraints, like eraseblock/pagesize alignment) * @@ -209,6 +211,7 @@ struct mtd_debug_info { struct mtd_part { struct list_head node; u64 offset; + u64 size; u32 flags; }; @@ -622,7 +625,9 @@ static inline uint32_t mtd_mod_by_ws(uint64_t sz, struct mtd_info *mtd) static inline int mtd_wunit_per_eb(struct mtd_info *mtd) { - return mtd->erasesize / mtd->writesize; + struct mtd_info *master = mtd_get_master(mtd); + + return master->erasesize / mtd->writesize; } static inline int mtd_offset_to_wunit(struct mtd_info *mtd, loff_t offs) diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index e545c050d3e8..b74a539ec581 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -37,6 +37,7 @@ * master MTD flag set for the corresponding MTD partition. * For example, to force a read-only partition, simply adding * MTD_WRITEABLE to the mask_flags will do the trick. + * add_flags: contains flags to add to the parent flags * * Note: writeable partitions require their size and offset be * erasesize aligned (e.g. use MTDPART_OFS_NEXTBLK). @@ -48,6 +49,7 @@ struct mtd_partition { uint64_t size; /* partition size */ uint64_t offset; /* offset within the master MTD space */ uint32_t mask_flags; /* master MTD flags to mask out for this partition */ + uint32_t add_flags; /* flags to add to the partition */ struct device_node *of_node; }; diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h index 47ffe3208c27..4b48fbf7d343 100644 --- a/include/uapi/mtd/mtd-abi.h +++ b/include/uapi/mtd/mtd-abi.h @@ -104,6 +104,7 @@ struct mtd_write_req { #define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */ #define MTD_NO_ERASE 0x1000 /* No erase necessary */ #define MTD_POWERUP_LOCK 0x2000 /* Always locked after reset */ +#define MTD_SLC_ON_MLC_EMULATION 0x4000 /* Emulate SLC behavior on MLC NANDs */ /* Some common devices / combinations of capabilities */ #define MTD_CAP_ROM 0 -- cgit v1.2.3 From 734e5e4e268f792d514ec2313792f30ec5f6c94f Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 5 May 2020 22:13:06 +0200 Subject: rtc: add new VL flag for backup switchover A new flag RTC_VL_BACKUP_SWITCH means that a backup switchover happened since last flag clear. Link: https://lore.kernel.org/r/20200505201310.255145-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- include/uapi/linux/rtc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtc.h b/include/uapi/linux/rtc.h index 83bba58d47f4..fa9aff91cbf2 100644 --- a/include/uapi/linux/rtc.h +++ b/include/uapi/linux/rtc.h @@ -99,6 +99,7 @@ struct rtc_pll_info { #define RTC_VL_BACKUP_LOW _BITUL(1) /* Backup voltage is low */ #define RTC_VL_BACKUP_EMPTY _BITUL(2) /* Backup empty or not present */ #define RTC_VL_ACCURACY_LOW _BITUL(3) /* Voltage is low, RTC accuracy is reduced */ +#define RTC_VL_BACKUP_SWITCH _BITUL(4) /* Backup switchover happened */ #define RTC_VL_READ _IOR('p', 0x13, unsigned int) /* Voltage low detection */ #define RTC_VL_CLR _IO('p', 0x14) /* Clear voltage low information */ -- cgit v1.2.3 From ab8d78093dfa2e7820ca0c28dda9142aa771c510 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 11 May 2020 17:15:35 +0100 Subject: bpf: Minor fixes to BPF helpers documentation Minor improvements to the documentation for BPF helpers: * Fix formatting for the description of "bpf_socket" for bpf_getsockopt() and bpf_setsockopt(), thus suppressing two warnings from rst2man about "Unexpected indentation". * Fix formatting for return values for bpf_sk_assign() and seq_file helpers. * Fix and harmonise formatting, in particular for function/struct names. * Remove blank lines before "Return:" sections. * Replace tabs found in the middle of text lines. * Fix typos. * Add a note to the footer (in Python script) about "bpftool feature probe", including for listing features available to unprivileged users, and add a reference to bpftool man page. Thanks to Florian for reporting two typos (duplicated words). Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200511161536.29853-4-quentin@isovalent.com --- include/uapi/linux/bpf.h | 109 ++++++++++++++++++++++++--------------------- scripts/bpf_helpers_doc.py | 6 +++ 2 files changed, 65 insertions(+), 50 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9d1932e23cec..bfb31c1be219 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -675,8 +675,8 @@ union bpf_attr { * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. * - * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() - * instead. + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. * Return * 0 on success, or a negative error in case of failure. * @@ -684,7 +684,7 @@ union bpf_attr { * Description * Return the time elapsed since system boot, in nanoseconds. * Does not include time the system was suspended. - * See: clock_gettime(CLOCK_MONOTONIC) + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) * Return * Current *ktime*. * @@ -1543,11 +1543,11 @@ union bpf_attr { * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address - * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for * more details. * - * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() - * instead. + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative @@ -1575,7 +1575,7 @@ union bpf_attr { * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description - * Equivalent to bpf_get_socket_cookie() helper that accepts + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. @@ -1604,6 +1604,7 @@ union bpf_attr { * The option value of length *optlen* is pointed by *optval*. * * *bpf_socket* should be one of the following: + * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * and **BPF_CGROUP_INET6_CONNECT**. @@ -1672,12 +1673,12 @@ union bpf_attr { * * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be - * one of the XDP program return codes up to XDP_TX, as chosen by - * the caller. Any higher bits in the *flags* argument must be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. Any higher bits in the *flags* argument must be * unset. * - * See also bpf_redirect(), which only supports redirecting to an - * ifindex, but doesn't require a map to do so. + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. * Return * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. @@ -1785,7 +1786,7 @@ union bpf_attr { * the time running for event since last normalization. The * enabled and running times are accumulated since the perf event * open. To achieve scaling factor between two invocations of an - * eBPF program, users can can use CPU id as the key (which is + * eBPF program, users can use CPU id as the key (which is * typical for perf array usage model) to remember the previous * value and do the calculation inside the eBPF program. * Return @@ -1812,6 +1813,7 @@ union bpf_attr { * *opval* and of length *optlen*. * * *bpf_socket* should be one of the following: + * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * and **BPF_CGROUP_INET6_CONNECT**. @@ -1833,7 +1835,7 @@ union bpf_attr { * The first argument is the context *regs* on which the kprobe * works. * - * This helper works by setting setting the PC (program counter) + * This helper works by setting the PC (program counter) * to an override function which is run in place of the original * probed function. This means the probed function is not run at * all. The replacement function just returns with the required @@ -2300,7 +2302,7 @@ union bpf_attr { * **bpf_rc_keydown**\ () again with the same values, or calling * **bpf_rc_repeat**\ (). * - * Some protocols include a toggle bit, in case the button was + * Some protocols include a toggle bit, in case the button was * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into @@ -2646,7 +2648,6 @@ union bpf_attr { * * *th* points to the start of the TCP header, while *th_len* * contains **sizeof**\ (**struct tcphdr**). - * * Return * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. @@ -2829,7 +2830,6 @@ union bpf_attr { * * *th* points to the start of the TCP header, while *th_len* * contains the length of the TCP header. - * * Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, @@ -2912,7 +2912,7 @@ union bpf_attr { * // size, after checking its boundaries. * } * - * In comparison, using **bpf_probe_read_user()** helper here + * In comparison, using **bpf_probe_read_user**\ () helper here * instead to read the string would require to estimate the length * at compile time, and would often result in copying more memory * than necessary. @@ -2930,14 +2930,14 @@ union bpf_attr { * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* - * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. * Return - * On success, the strictly positive length of the string, including + * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description - * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. @@ -2965,19 +2965,19 @@ union bpf_attr { * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the - * branch records (struct perf_branch_entry) associated to *ctx* - * and store it in the buffer pointed by *buf* up to size + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size * *size* bytes. * Return * On success, number of bytes written to *buf*. On error, a * negative value. * * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to - * instead return the number of bytes required to store all the + * instead return the number of bytes required to store all the * branch entries. If this flag is set, *buf* may be NULL. * * **-EINVAL** if arguments invalid or **size** not a multiple - * of sizeof(struct perf_branch_entry). + * of **sizeof**\ (**struct perf_branch_entry**\ ). * * **-ENOENT** if architecture does not support branch records. * @@ -2985,8 +2985,8 @@ union bpf_attr { * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. - * - * On failure, the returned value is one of the following: + * Return + * 0 on success, or one of the following in case of failure: * * **-EINVAL** if dev and inum supplied don't match dev_t and inode number * with nsfs of current task, or if dev conversion to dev_t lost high bits. @@ -3025,8 +3025,8 @@ union bpf_attr { * a global identifier that can be assumed unique. If *ctx* is * NULL, then the helper returns the cookie for the initial * network namespace. The cookie itself is very similar to that - * of bpf_get_socket_cookie() helper, but for network namespaces - * instead of sockets. + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. * Return * A 8-byte long opaque number. * @@ -3061,57 +3061,66 @@ union bpf_attr { * * The *flags* argument must be zero. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: * - * * **-EINVAL** Unsupported flags specified. - * * **-ENOENT** Socket is unavailable for assignment. - * * **-ENETUNREACH** Socket is unreachable (wrong netns). - * * **-EOPNOTSUPP** Unsupported operation, for example a - * call from outside of TC ingress. - * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. * Does include the time the system was suspended. - * See: clock_gettime(CLOCK_BOOTTIME) + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) * Return * Current *ktime*. * * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description - * seq_printf uses seq_file seq_printf() to print out the format string. + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. * The *m* represents the seq_file. The *fmt* and *fmt_size* are for * the format string itself. The *data* and *data_len* are format string - * arguments. The *data* are a u64 array and corresponding format string + * arguments. The *data* are a **u64** array and corresponding format string * values are stored in the array. For strings and pointers where pointees * are accessed, only the pointer values are stored in the *data* array. - * The *data_len* is the *data* size in term of bytes. + * The *data_len* is the size of *data* in bytes. * * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. * Reading kernel memory may fail due to either invalid address or * valid address but requiring a major memory fault. If reading kernel memory * fails, the string for **%s** will be an empty string, and the ip * address for **%p{i,I}{4,6}** will be 0. Not returning error to - * bpf program is consistent with what bpf_trace_printk() does for now. + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. * - * * **-EBUSY** Percpu memory copy buffer is busy, can try again - * by returning 1 from bpf program. - * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. - * * **-E2BIG** Too many format specifiers. - * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description - * seq_write uses seq_file seq_write() to write the data. + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the - * data to write in bytes. + * data to write in bytes. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: * - * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index ded304c96a05..91fa668fa860 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -318,6 +318,11 @@ may be interested in: of eBPF maps are used with a given helper function. * *kernel/bpf/* directory contains other files in which additional helpers are defined (for cgroups, sockmaps, etc.). +* The bpftool utility can be used to probe the availability of helper functions + on the system (as well as supported program and map types, and a number of + other parameters). To do so, run **bpftool feature probe** (see + **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to + list features available to unprivileged users. Compatibility between helper functions and program types can generally be found in the files where helper functions are defined. Look for the **struct @@ -338,6 +343,7 @@ SEE ALSO ======== **bpf**\ (2), +**bpftool**\ (8), **cgroups**\ (7), **ip**\ (8), **perf_event_open**\ (2), -- cgit v1.2.3 From 6446ec6cbf46483737e832cd6050885fa8eb87fa Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 7 May 2020 17:12:52 +0200 Subject: media: v4l2-subdev: add VIDIOC_SUBDEV_QUERYCAP ioctl While normal video/radio/vbi/swradio nodes have a proper QUERYCAP ioctl that apps can call to determine that it is indeed a V4L2 device, there is currently no equivalent for v4l-subdev nodes. Adding this ioctl will solve that, and it will allow utilities like v4l2-compliance to be used with these devices as well. SUBDEV_QUERYCAP currently returns the version and capabilities of the subdevice. Define a capability flag to report if the subdevice is registered in read-only mode. Signed-off-by: Hans Verkuil Signed-off-by: Jacopo Mondi Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-subdev.c | 11 +++++++++++ include/uapi/linux/v4l2-subdev.h | 16 ++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/uapi') diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c index 174778f9c0bc..665b0c0f74cf 100644 --- a/drivers/media/v4l2-core/v4l2-subdev.c +++ b/drivers/media/v4l2-core/v4l2-subdev.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -344,6 +345,16 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg) int rval; switch (cmd) { + case VIDIOC_SUBDEV_QUERYCAP: { + struct v4l2_subdev_capability *cap = arg; + + memset(cap->reserved, 0, sizeof(cap->reserved)); + cap->version = LINUX_VERSION_CODE; + cap->capabilities = ro_subdev ? V4L2_SUBDEV_CAP_RO_SUBDEV : 0; + + return 0; + } + case VIDIOC_QUERYCTRL: /* * TODO: this really should be folded into v4l2_queryctrl (this diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h index 03970ce30741..5d2a1dab7911 100644 --- a/include/uapi/linux/v4l2-subdev.h +++ b/include/uapi/linux/v4l2-subdev.h @@ -155,9 +155,25 @@ struct v4l2_subdev_selection { __u32 reserved[8]; }; +/** + * struct v4l2_subdev_capability - subdev capabilities + * @version: the driver versioning number + * @capabilities: the subdev capabilities, see V4L2_SUBDEV_CAP_* + * @reserved: for future use, set to zero for now + */ +struct v4l2_subdev_capability { + __u32 version; + __u32 capabilities; + __u32 reserved[14]; +}; + +/* The v4l2 sub-device video device node is registered in read-only mode. */ +#define V4L2_SUBDEV_CAP_RO_SUBDEV BIT(0) + /* Backwards compatibility define --- to be removed */ #define v4l2_subdev_edid v4l2_edid +#define VIDIOC_SUBDEV_QUERYCAP _IOR('V', 0, struct v4l2_subdev_capability) #define VIDIOC_SUBDEV_G_FMT _IOWR('V', 4, struct v4l2_subdev_format) #define VIDIOC_SUBDEV_S_FMT _IOWR('V', 5, struct v4l2_subdev_format) #define VIDIOC_SUBDEV_G_FRAME_INTERVAL _IOWR('V', 21, struct v4l2_subdev_frame_interval) -- cgit v1.2.3 From 7d33850abdb9048c6aa421440a64905eb4ad07a2 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 31 Mar 2020 11:40:33 +0200 Subject: floppy: add references to 82077's extra registers This controller provides extra status registers SRA and SRB as well as a tape drive register (TDR) and a data rate select register (DSR), which are referenced in the sparc port, so let's have their symbolic definitions centralized. Link: https://lore.kernel.org/r/20200331094054.24441-3-w@1wt.eu Signed-off-by: Willy Tarreau Signed-off-by: Denis Efremov --- include/uapi/linux/fdreg.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/fdreg.h b/include/uapi/linux/fdreg.h index 1318881954e1..10d33632939d 100644 --- a/include/uapi/linux/fdreg.h +++ b/include/uapi/linux/fdreg.h @@ -7,13 +7,23 @@ * Handbook", Sanches and Canton. */ -/* Fd controller regs. S&C, about page 340 */ -#define FD_STATUS 4 -#define FD_DATA 5 +/* 82077's auxiliary status registers A & B (R) */ +#define FD_SRA 0 +#define FD_SRB 1 /* Digital Output Register */ #define FD_DOR 2 +/* 82077's tape drive register (R/W) */ +#define FD_TDR 3 + +/* 82077's data rate select register (W) */ +#define FD_DSR 4 + +/* Fd controller regs. S&C, about page 340 */ +#define FD_STATUS 4 +#define FD_DATA 5 + /* Digital Input Register (read) */ #define FD_DIR 7 -- cgit v1.2.3 From 9c4c5a24c85585fb8904bd2872501cd8181b3854 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 1 May 2020 16:44:14 +0300 Subject: floppy: add FD_AUTODETECT_SIZE define for struct floppy_drive_params Use FD_AUTODETECT_SIZE for autodetect buffer size in struct floppy_drive_params instead of a magic number. Link: https://lore.kernel.org/r/20200501134416.72248-3-efremov@linux.com Reviewed-by: Christoph Hellwig Signed-off-by: Denis Efremov --- drivers/block/floppy.c | 9 +++++---- include/uapi/linux/fd.h | 5 ++++- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 3ab6e804b5ec..b82b3d38b834 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2073,7 +2073,8 @@ static int next_valid_format(int drive) probed_format = drive_state[drive].probed_format; while (1) { - if (probed_format >= 8 || !drive_params[drive].autodetect[probed_format]) { + if (probed_format >= FD_AUTODETECT_SIZE || + !drive_params[drive].autodetect[probed_format]) { drive_state[drive].probed_format = 0; return 1; } @@ -3442,13 +3443,13 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static bool valid_floppy_drive_params(const short autodetect[8], +static bool valid_floppy_drive_params(const short autodetect[FD_AUTODETECT_SIZE], int native_format) { size_t floppy_type_size = ARRAY_SIZE(floppy_type); size_t i = 0; - for (i = 0; i < 8; ++i) { + for (i = 0; i < FD_AUTODETECT_SIZE; ++i) { if (autodetect[i] < 0 || autodetect[i] >= floppy_type_size) return false; @@ -3673,7 +3674,7 @@ struct compat_floppy_drive_params { struct floppy_max_errors max_errors; char flags; char read_track; - short autodetect[8]; + short autodetect[FD_AUTODETECT_SIZE]; compat_int_t checkfreq; compat_int_t native_format; }; diff --git a/include/uapi/linux/fd.h b/include/uapi/linux/fd.h index 90fb94712c41..3f6b7be4c096 100644 --- a/include/uapi/linux/fd.h +++ b/include/uapi/linux/fd.h @@ -172,7 +172,10 @@ struct floppy_drive_params { * used in succession to try to read the disk. If the FDC cannot lock onto * the disk, the next format is tried. This uses the variable 'probing'. */ - short autodetect[8]; /* autodetected formats */ + +#define FD_AUTODETECT_SIZE 8 + + short autodetect[FD_AUTODETECT_SIZE]; /* autodetected formats */ int checkfreq; /* how often should the drive be checked for disk * changes */ -- cgit v1.2.3 From bd10a5f3e21b1cb8e2133c1f08b3e8207cee12dd Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 1 May 2020 16:44:15 +0300 Subject: floppy: add defines for sizes of cmd & reply buffers of floppy_raw_cmd Use FD_RAW_CMD_SIZE, FD_RAW_REPLY_SIZE defines instead of magic numbers for cmd & reply buffers of struct floppy_raw_cmd. Remove local to floppy.c MAX_REPLIES define, as it is now FD_RAW_REPLY_SIZE. FD_RAW_CMD_FULLSIZE added as we allow command to also fill reply_count and reply fields. Link: https://lore.kernel.org/r/20200501134416.72248-4-efremov@linux.com Reviewed-by: Christoph Hellwig Signed-off-by: Denis Efremov --- drivers/block/floppy.c | 19 +++++-------------- include/uapi/linux/fd.h | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index b82b3d38b834..9e098d53b046 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -337,8 +337,7 @@ static bool initialized; /* * globals used by 'result()' */ -#define MAX_REPLIES 16 -static unsigned char reply_buffer[MAX_REPLIES]; +static unsigned char reply_buffer[FD_RAW_REPLY_SIZE]; static int inr; /* size of reply buffer, when called from interrupt */ #define ST0 0 #define ST1 1 @@ -1162,7 +1161,7 @@ static int result(int fdc) int i; int status = 0; - for (i = 0; i < MAX_REPLIES; i++) { + for (i = 0; i < FD_RAW_REPLY_SIZE; i++) { status = wait_til_ready(fdc); if (status < 0) break; @@ -3079,7 +3078,7 @@ static void raw_cmd_done(int flag) raw_cmd->flags |= FD_RAW_HARDFAILURE; } else { raw_cmd->reply_count = inr; - if (raw_cmd->reply_count > MAX_REPLIES) + if (raw_cmd->reply_count > FD_RAW_REPLY_SIZE) raw_cmd->reply_count = 0; for (i = 0; i < raw_cmd->reply_count; i++) raw_cmd->reply[i] = reply_buffer[i]; @@ -3190,18 +3189,10 @@ loop: if (ret) return -EFAULT; param += sizeof(struct floppy_raw_cmd); - if (ptr->cmd_count > 33) - /* the command may now also take up the space - * initially intended for the reply & the - * reply count. Needed for long 82078 commands - * such as RESTORE, which takes ... 17 command - * bytes. Murphy's law #137: When you reserve - * 16 bytes for a structure, you'll one day - * discover that you really need 17... - */ + if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE) return -EINVAL; - for (i = 0; i < 16; i++) + for (i = 0; i < FD_RAW_REPLY_SIZE; i++) ptr->reply[i] = 0; ptr->resultcode = 0; diff --git a/include/uapi/linux/fd.h b/include/uapi/linux/fd.h index 3f6b7be4c096..2e9c2c1c18e6 100644 --- a/include/uapi/linux/fd.h +++ b/include/uapi/linux/fd.h @@ -360,10 +360,20 @@ struct floppy_raw_cmd { int buffer_length; /* length of allocated buffer */ unsigned char rate; + +#define FD_RAW_CMD_SIZE 16 +#define FD_RAW_REPLY_SIZE 16 +#define FD_RAW_CMD_FULLSIZE (FD_RAW_CMD_SIZE + 1 + FD_RAW_REPLY_SIZE) + + /* The command may take up the space initially intended for the reply + * and the reply count. Needed for long 82078 commands such as RESTORE, + * which takes 17 command bytes. + */ + unsigned char cmd_count; - unsigned char cmd[16]; + unsigned char cmd[FD_RAW_CMD_SIZE]; unsigned char reply_count; - unsigned char reply[16]; + unsigned char reply[FD_RAW_REPLY_SIZE]; int track; int resultcode; -- cgit v1.2.3 From 0836275df4db20daf040fff5d9a1da89c4c08a85 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 1 May 2020 16:44:16 +0300 Subject: floppy: suppress UBSAN warning in setup_rw_floppy() UBSAN: array-index-out-of-bounds in drivers/block/floppy.c:1521:45 index 16 is out of range for type 'unsigned char [16]' Call Trace: ... setup_rw_floppy+0x5c3/0x7f0 floppy_ready+0x2be/0x13b0 process_one_work+0x2c1/0x5d0 worker_thread+0x56/0x5e0 kthread+0x122/0x170 ret_from_fork+0x35/0x40 From include/uapi/linux/fd.h: struct floppy_raw_cmd { ... unsigned char cmd_count; unsigned char cmd[16]; unsigned char reply_count; unsigned char reply[16]; ... } This out-of-bounds access is intentional. The command in struct floppy_raw_cmd may take up the space initially intended for the reply and the reply count. It is needed for long 82078 commands such as RESTORE, which takes 17 command bytes. Initial cmd size is not enough and since struct setup_rw_floppy is a part of uapi we check that cmd_count is in [0:16+1+16] in raw_cmd_copyin(). The patch adds union with original cmd,reply_count,reply fields and fullcmd field of equivalent size. The cmd accesses are turned to fullcmd where appropriate to suppress UBSAN warning. Link: https://lore.kernel.org/r/20200501134416.72248-5-efremov@linux.com Reviewed-by: Christoph Hellwig Signed-off-by: Denis Efremov --- drivers/block/floppy.c | 4 ++-- include/uapi/linux/fd.h | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 9e098d53b046..064c1acb9f00 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1070,7 +1070,7 @@ static void setup_DMA(void) if (raw_cmd->length == 0) { print_hex_dump(KERN_INFO, "zero dma transfer size: ", DUMP_PREFIX_NONE, 16, 1, - raw_cmd->cmd, raw_cmd->cmd_count, false); + raw_cmd->fullcmd, raw_cmd->cmd_count, false); cont->done(0); fdc_state[current_fdc].reset = 1; return; @@ -1515,7 +1515,7 @@ static void setup_rw_floppy(void) r = 0; for (i = 0; i < raw_cmd->cmd_count; i++) - r |= output_byte(current_fdc, raw_cmd->cmd[i]); + r |= output_byte(current_fdc, raw_cmd->fullcmd[i]); debugt(__func__, "rw_command"); diff --git a/include/uapi/linux/fd.h b/include/uapi/linux/fd.h index 2e9c2c1c18e6..8b80c63b971c 100644 --- a/include/uapi/linux/fd.h +++ b/include/uapi/linux/fd.h @@ -371,9 +371,14 @@ struct floppy_raw_cmd { */ unsigned char cmd_count; - unsigned char cmd[FD_RAW_CMD_SIZE]; - unsigned char reply_count; - unsigned char reply[FD_RAW_REPLY_SIZE]; + union { + struct { + unsigned char cmd[FD_RAW_CMD_SIZE]; + unsigned char reply_count; + unsigned char reply[FD_RAW_REPLY_SIZE]; + }; + unsigned char fullcmd[FD_RAW_CMD_FULLSIZE]; + }; int track; int resultcode; -- cgit v1.2.3 From 17793833f81ceb319be599ec09498ec0136d9acf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 4 May 2020 16:25:41 +0300 Subject: RDMA/ucma: Return stable IB device index as identifier The librdmacm uses node_guid as identifier to correlate between IB devices and CMA devices. However FW resets cause to such "connection" to be lost and require from the user to restart its application. Extend UCMA to return IB device index, which is stable identifier. Link: https://lore.kernel.org/r/20200504132541.355710-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 16 +++++++++------- include/uapi/rdma/rdma_user_cm.h | 4 ++++ 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 16b6cf57fa85..06127c800a49 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -845,7 +845,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -869,6 +869,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; + resp.ibdev_index = ctx->cm_id->device->index; resp.port_num = ctx->cm_id->port_num; if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) @@ -880,8 +881,8 @@ static ssize_t ucma_query_route(struct ucma_file *file, out: mutex_unlock(&ctx->mutex); - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, + min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; ucma_put_ctx(ctx); @@ -895,6 +896,7 @@ static void ucma_query_device_addr(struct rdma_cm_id *cm_id, return; resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->ibdev_index = cm_id->device->index; resp->port_num = cm_id->port_num; resp->pkey = (__force __u16) cpu_to_be16( ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); @@ -907,7 +909,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -922,7 +924,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, ucma_query_device_addr(ctx->cm_id, &resp); - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; @@ -974,7 +976,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, struct sockaddr_ib *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -1007,7 +1009,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, &ctx->cm_id->route.addr.dst_addr); } - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index e42940a215a3..1bb6e75d254b 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -164,6 +164,8 @@ struct rdma_ucm_query_route_resp { __u32 num_paths; __u8 port_num; __u8 reserved[3]; + __u32 ibdev_index; + __u32 reserved1; }; struct rdma_ucm_query_addr_resp { @@ -175,6 +177,8 @@ struct rdma_ucm_query_addr_resp { __u16 dst_size; struct __kernel_sockaddr_storage src_addr; struct __kernel_sockaddr_storage dst_addr; + __u32 ibdev_index; + __u32 reserved1; }; struct rdma_ucm_query_path_resp { -- cgit v1.2.3 From 8c112a5f29a343f89072bed4b9fa176fea226798 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 4 May 2020 08:30:12 +0300 Subject: RDMA/mlx5: Add support in steering default miss User can configure default miss rule in order to skip matching in the user domain and forward the packet to the kernel steering domain. When user requests a default miss rule, we add steering rule to forward the traffic to the next namespace. Link: https://lore.kernel.org/r/20200504053012.270689-5-leon@kernel.org Signed-off-by: Maor Gottlieb Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 32 ++++++++++++++++++++++++++------ drivers/infiniband/hw/mlx5/main.c | 9 ++++----- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 5 +++++ 3 files changed, 35 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 5533b5083c29..3fa66474afa6 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -69,19 +69,32 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { static int get_dests(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id, - int *dest_type, struct ib_qp **qp) + int *dest_type, struct ib_qp **qp, bool *def_miss) { bool dest_devx, dest_qp; void *devx_obj; + u32 flags; + int err; dest_devx = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); dest_qp = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS && - ((dest_devx && dest_qp) || (!dest_devx && !dest_qp))) - return -EINVAL; + *def_miss = false; + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS); + if (err) + return err; + *def_miss = flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS; + + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { + if (dest_devx && (dest_qp || *def_miss)) + return -EINVAL; + else if (dest_qp && *def_miss) + return -EINVAL; + } /* Allow only DEVX object as dest when inserting to FDB */ if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !dest_devx) @@ -153,6 +166,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( void *devx_obj, *cmd_in; struct ib_uobject *uobj; struct mlx5_ib_dev *dev; + bool def_miss; if (!capable(CAP_NET_RAW)) return -EPERM; @@ -162,9 +176,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); dev = mlx5_udata_to_mdev(&attrs->driver_udata); - if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp)) + if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &def_miss)) return -EINVAL; + if (def_miss) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS; + len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { @@ -636,7 +653,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, UVERBS_ATTR_MIN_SIZE(sizeof(u32)), UA_OPTIONAL, - UA_ALLOC_AND_COPY)); + UA_ALLOC_AND_COPY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + enum mlx5_ib_create_flow_flags, + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_DESTROY_FLOW, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3af57dfe7224..38bf3841741c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4200,18 +4200,17 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) { dst[dst_num].type = dest_type; - dst[dst_num].tir_num = dest_id; + dst[dst_num++].tir_num = dest_id; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) { dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; - dst[dst_num].ft_num = dest_id; + dst[dst_num++].ft_num = dest_id; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - } else { - dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT; + } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_PORT) { + dst[dst_num++].type = MLX5_FLOW_DESTINATION_TYPE_PORT; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; } - dst_num++; if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 24f3388c3182..07cf54333193 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -241,6 +241,10 @@ enum mlx5_ib_flow_type { MLX5_IB_FLOW_TYPE_MC_DEFAULT, }; +enum mlx5_ib_create_flow_flags { + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS = 1 << 0, +}; + enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE, @@ -251,6 +255,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_TAG, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS, }; enum mlx5_ib_destoy_flow_attrs { -- cgit v1.2.3 From f29de9eee78253d9ae57cd58a6b21eed021742c8 Mon Sep 17 00:00:00 2001 From: Daria Velikovsky Date: Mon, 4 May 2020 08:42:27 +0300 Subject: RDMA/mlx5: Add support for drop action in DV steering When drop action is used the matching packet will stop processing in steering and will be dropped. This functionality will allow users to drop matching packets. Link: https://lore.kernel.org/r/20200504054227.271486-1-leon@kernel.org Signed-off-by: Daria Velikovsky Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 35 +++++++++++++++++++------------- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 3fa66474afa6..6fa1a510c5d7 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -69,11 +69,10 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { static int get_dests(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id, - int *dest_type, struct ib_qp **qp, bool *def_miss) + int *dest_type, struct ib_qp **qp, u32 *flags) { bool dest_devx, dest_qp; void *devx_obj; - u32 flags; int err; dest_devx = uverbs_attr_is_valid(attrs, @@ -81,23 +80,28 @@ static int get_dests(struct uverbs_attr_bundle *attrs, dest_qp = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - *def_miss = false; - err = uverbs_get_flags32(&flags, attrs, - MLX5_IB_ATTR_CREATE_FLOW_FLAGS, - MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS); + *flags = 0; + err = uverbs_get_flags32(flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS | + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP); if (err) return err; - *def_miss = flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS; + + /* Both flags are not allowed */ + if (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS && + *flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP) + return -EINVAL; if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { - if (dest_devx && (dest_qp || *def_miss)) + if (dest_devx && (dest_qp || *flags)) return -EINVAL; - else if (dest_qp && *def_miss) + else if (dest_qp && *flags) return -EINVAL; } - /* Allow only DEVX object as dest when inserting to FDB */ - if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !dest_devx) + /* Allow only DEVX object, drop as dest for FDB */ + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !(dest_devx || + (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP))) return -EINVAL; /* Allow only DEVX object or QP as dest when inserting to RDMA_RX */ @@ -166,7 +170,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( void *devx_obj, *cmd_in; struct ib_uobject *uobj; struct mlx5_ib_dev *dev; - bool def_miss; + u32 flags; if (!capable(CAP_NET_RAW)) return -EPERM; @@ -176,12 +180,15 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); dev = mlx5_udata_to_mdev(&attrs->driver_udata); - if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &def_miss)) + if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &flags)) return -EINVAL; - if (def_miss) + if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS) flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS; + if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP; + len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 07cf54333193..8e316ef896b5 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -243,6 +243,7 @@ enum mlx5_ib_flow_type { enum mlx5_ib_create_flow_flags { MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS = 1 << 0, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP = 1 << 1, }; enum mlx5_ib_create_flow_attrs { -- cgit v1.2.3 From 581701b7efd60ba13d8a7eed60cbdd7fefaf6696 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 May 2020 16:44:24 +0200 Subject: uapi: deprecate STATX_ALL Constants of the *_ALL type can be actively harmful due to the fact that developers will usually fail to consider the possible effects of future changes to the definition. Deprecate STATX_ALL in the uapi, while no damage has been done yet. We could keep something like this around in the kernel, but there's actually no point, since all filesystems should be explicitly checking flags that they support and not rely on the VFS masking unknown ones out: a flag could be known to the VFS, yet not known to the filesystem. Cc: David Howells Cc: linux-api@vger.kernel.org Cc: linux-man@vger.kernel.org Signed-off-by: Miklos Szeredi Reviewed-by: Christoph Hellwig --- fs/stat.c | 1 - include/uapi/linux/stat.h | 11 ++++++++++- samples/vfs/test-statx.c | 2 +- tools/include/uapi/linux/stat.h | 11 ++++++++++- 4 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/fs/stat.c b/fs/stat.c index 030008796479..a6709e7ba71d 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -70,7 +70,6 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; - request_mask &= STATX_ALL; query_flags &= KSTAT_QUERY_FLAGS; /* allow the fs to override these if it really wants to */ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index ad80a5c885d5..d1192783139a 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -148,9 +148,18 @@ struct statx { #define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ -#define STATX_ALL 0x00000fffU /* All currently supported flags */ + #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. + */ +#define STATX_ALL 0x00000fffU +#endif + /* * Attributes to be found in stx_attributes and masked in stx_attributes_mask. * diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c index a3d68159fb51..76c577ea4fd8 100644 --- a/samples/vfs/test-statx.c +++ b/samples/vfs/test-statx.c @@ -216,7 +216,7 @@ int main(int argc, char **argv) struct statx stx; int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW; - unsigned int mask = STATX_ALL; + unsigned int mask = STATX_BASIC_STATS | STATX_BTIME; for (argv++; *argv; argv++) { if (strcmp(*argv, "-F") == 0) { diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index ad80a5c885d5..d1192783139a 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -148,9 +148,18 @@ struct statx { #define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ -#define STATX_ALL 0x00000fffU /* All currently supported flags */ + #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ +#ifndef __KERNEL__ +/* + * This is deprecated, and shall remain the same value in the future. To avoid + * confusion please use the equivalent (STATX_BASIC_STATS | STATX_BTIME) + * instead. + */ +#define STATX_ALL 0x00000fffU +#endif + /* * Attributes to be found in stx_attributes and masked in stx_attributes_mask. * -- cgit v1.2.3 From fa2fcf4f1df1559a0a4ee0f46915b496cc2ebf60 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 May 2020 16:44:24 +0200 Subject: statx: add mount ID Systemd is hacking around to get it and it's trivial to add to statx, so... Cc: linux-api@vger.kernel.org Cc: linux-man@vger.kernel.org Signed-off-by: Miklos Szeredi Reviewed-by: Christoph Hellwig --- fs/stat.c | 4 ++++ include/linux/stat.h | 1 + include/uapi/linux/stat.h | 6 +++++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/stat.c b/fs/stat.c index f7f07d1b73cb..3d88c99f7743 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -22,6 +22,7 @@ #include #include "internal.h" +#include "mount.h" /** * generic_fillattr - Fill in the basic attributes from the inode struct @@ -199,6 +200,8 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -563,6 +566,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); + tmp.stx_mnt_id = stat->mnt_id; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/include/linux/stat.h b/include/linux/stat.h index 528c4baad091..56614af83d4a 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -47,6 +47,7 @@ struct kstat { struct timespec64 ctime; struct timespec64 btime; /* File creation time */ u64 blocks; + u64 mnt_id; }; #endif diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index d1192783139a..d81456247f10 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -123,7 +123,10 @@ struct statx { __u32 stx_dev_major; /* ID of device containing file [uncond] */ __u32 stx_dev_minor; /* 0x90 */ - __u64 __spare2[14]; /* Spare space for future expansion */ + __u64 stx_mnt_id; + __u64 __spare2; + /* 0xa0 */ + __u64 __spare3[12]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -148,6 +151,7 @@ struct statx { #define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ -- cgit v1.2.3 From 80340fe3605c0e78cfe496c3b3878be828cfdbfe Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 May 2020 16:44:24 +0200 Subject: statx: add mount_root Determining whether a path or file descriptor refers to a mountpoint (or more precisely a mount root) is not trivial using current tools. Add a flag to statx that indicates whether the path or fd refers to the root of a mount or not. Cc: linux-api@vger.kernel.org Cc: linux-man@vger.kernel.org Reported-by: Lennart Poettering Reported-by: J. Bruce Fields Signed-off-by: Miklos Szeredi Reviewed-by: Christoph Hellwig --- fs/stat.c | 3 +++ include/uapi/linux/stat.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/fs/stat.c b/fs/stat.c index 3d88c99f7743..b9faa6cafafe 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -202,6 +202,9 @@ retry: error = vfs_getattr(&path, stat, request_mask, flags); stat->mnt_id = real_mount(path.mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) + stat->attributes |= STATX_ATTR_MOUNT_ROOT; + stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index d81456247f10..6df9348bb277 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -181,6 +181,7 @@ struct statx { #define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ #define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ #define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ +#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ -- cgit v1.2.3 From c8ffd8bcdd28296a198f237cc595148a8d4adfbe Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 May 2020 16:44:25 +0200 Subject: vfs: add faccessat2 syscall POSIX defines faccessat() as having a fourth "flags" argument, while the linux syscall doesn't have it. Glibc tries to emulate AT_EACCESS and AT_SYMLINK_NOFOLLOW, but AT_EACCESS emulation is broken. Add a new faccessat(2) syscall with the added flags argument and implement both flags. The value of AT_EACCESS is defined in glibc headers to be the same as AT_REMOVEDIR. Use this value for the kernel interface as well, together with the explanatory comment. Also add AT_EMPTY_PATH support, which is not documented by POSIX, but can be useful and is trivial to implement. Signed-off-by: Miklos Szeredi --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + fs/internal.h | 1 - fs/open.c | 34 ++++++++++++++++++++++------- include/linux/syscalls.h | 6 +++-- include/uapi/asm-generic/unistd.h | 4 +++- include/uapi/linux/fcntl.h | 10 +++++++++ 23 files changed, 62 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 36d42da7466a..5ddd128d4b7a 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -477,3 +477,4 @@ # 545 reserved for clone3 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd +549 common faccessat2 sys_faccessat2 diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 4d1cf74a2caa..d5cae5ffede0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -451,3 +451,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 803039d504de..3b859596840d 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 439 +#define __NR_compat_syscalls 440 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index c1c61635f89c..6d95d0c8bf2f 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -883,6 +883,8 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 042911e670b8..49e325b604b3 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -358,3 +358,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f4f49fcb76d0..f71b1bbcc198 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 435 common clone3 __sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 4c67b11f9c9e..edacc4561f2b 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 1f9e8ad636cc..f777141f5256 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -376,3 +376,4 @@ 435 n32 clone3 __sys_clone3 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd +439 n32 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c0b9d802dbf6..da8c76394e17 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -352,3 +352,4 @@ 435 n64 clone3 __sys_clone3 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd +439 n64 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index ac586774c980..13280625d312 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -425,3 +425,4 @@ 435 o32 clone3 __sys_clone3 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd +439 o32 faccessat2 sys_faccessat2 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 52a15f5cd130..5a758fa6ec52 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -435,3 +435,4 @@ 435 common clone3 sys_clone3_wrapper 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 220ae11555f2..f833a3190822 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -527,3 +527,4 @@ 435 spu clone3 sys_ni_syscall 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bd7bd3581a0f..bfdcb7633957 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 435 common clone3 sys_clone3 sys_clone3 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 sys_faccessat2 diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index c7a30fcd135f..acc35daa1b79 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index f13615ecdecc..8004a276cb74 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -483,3 +483,4 @@ # 435 reserved for clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 54581ac671b4..d8f8a1a69ed1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -442,3 +442,4 @@ 435 i386 clone3 sys_clone3 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd +439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 37b844f839bc..78847b32e137 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -359,6 +359,7 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 85a9ab1bc04d..69d0d73876b3 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -408,3 +408,4 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..0d467e32dd7e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,7 +126,6 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); -long do_faccessat(int dfd, const char __user *filename, int mode); int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); diff --git a/fs/open.c b/fs/open.c index 0ea3cd1a1250..e62b1db06638 100644 --- a/fs/open.c +++ b/fs/open.c @@ -394,20 +394,30 @@ static const struct cred *access_override_creds(void) return old_cred; } -long do_faccessat(int dfd, const char __user *filename, int mode) +long do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; - const struct cred *old_cred; + const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; - old_cred = access_override_creds(); - if (!old_cred) - return -ENOMEM; + if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) + return -EINVAL; + + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (!(flags & AT_EACCESS)) { + old_cred = access_override_creds(); + if (!old_cred) + return -ENOMEM; + } retry: res = user_path_at(dfd, filename, lookup_flags, &path); @@ -450,18 +460,26 @@ out_path_release: goto retry; } out: - revert_creds(old_cred); + if (old_cred) + revert_creds(old_cred); + return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { - return do_faccessat(dfd, filename, mode); + return do_faccessat(dfd, filename, mode, 0); +} + +SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, + int, flags) +{ + return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } int ksys_chdir(const char __user *filename) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1815065d52f3..7c354c2955f5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -428,6 +428,8 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); #endif asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode); +asmlinkage long sys_faccessat2(int dfd, const char __user *filename, int mode, + int flags); asmlinkage long sys_chdir(const char __user *filename); asmlinkage long sys_fchdir(unsigned int fd); asmlinkage long sys_chroot(const char __user *filename); @@ -1333,11 +1335,11 @@ static inline int ksys_chmod(const char __user *filename, umode_t mode) return do_fchmodat(AT_FDCWD, filename, mode); } -extern long do_faccessat(int dfd, const char __user *filename, int mode); +long do_faccessat(int dfd, const char __user *filename, int mode, int flags); static inline long ksys_access(const char __user *filename, int mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } extern int do_fchownat(int dfd, const char __user *filename, uid_t user, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 3a3201e4618e..f4a01305d9a6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -855,9 +855,11 @@ __SYSCALL(__NR_clone3, sys_clone3) __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) +#define __NR_faccessat2 439 +__SYSCALL(__NR_faccessat2, sys_faccessat2) #undef __NR_syscalls -#define __NR_syscalls 439 +#define __NR_syscalls 440 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ca88b7bce553..2f86b2ad6d7e 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -84,10 +84,20 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* + * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is + * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to + * unlinkat. The two functions do completely different things and therefore, + * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to + * faccessat would be undefined behavior and thus treating it equivalent to + * AT_EACCESS is valid undefined behavior. + */ #define AT_FDCWD -100 /* Special value used to indicate openat should use the current working directory. */ #define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ #define AT_REMOVEDIR 0x200 /* Remove directory instead of unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ -- cgit v1.2.3 From 7aebfa1b3885b5aa29fcb4a596d0485ac463bbe8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 13 May 2020 18:50:27 -0700 Subject: bpf: Support narrow loads from bpf_sock_addr.user_port bpf_sock_addr.user_port supports only 4-byte load and it leads to ugly code in BPF programs, like: volatile __u32 user_port = ctx->user_port; __u16 port = bpf_ntohs(user_port); Since otherwise clang may optimize the load to be 2-byte and it's rejected by verifier. Add support for 1- and 2-byte loads same way as it's supported for other fields in bpf_sock_addr like user_ip4, msg_src_ip4, etc. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/c1e983f4c17573032601d0b2b1f9d1274f24bc16.1589420814.git.rdna@fb.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 15 +++++++-------- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ diff --git a/net/core/filter.c b/net/core/filter.c index da0634979f53..1fe8c0c2d408 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7029,6 +7029,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -7059,10 +7060,6 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; @@ -7958,8 +7955,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): @@ -7994,9 +7991,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); + /* Account for sin6_port being smaller than user_port. */ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ -- cgit v1.2.3 From f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:47 -0700 Subject: bpf: Introduce bpf_sk_{, ancestor_}cgroup_id helpers With having ability to lookup sockets in cgroup skb programs it becomes useful to access cgroup id of retrieved sockets so that policies can be implemented based on origin cgroup of such socket. For example, a container running in a cgroup can have cgroup skb ingress program that can lookup peer socket that is sending packets to a process inside the container and decide whether those packets should be allowed or denied based on cgroup id of the peer. More specifically such ingress program can implement intra-host policy "allow incoming packets only from this same container and not from any other container on same host" w/o relying on source IP addresses since quite often it can be the case that containers share same IP address on the host. Introduce two new helpers for this use-case: bpf_sk_cgroup_id() and bpf_sk_ancestor_cgroup_id(). These helpers are similar to existing bpf_skb_{,ancestor_}cgroup_id helpers with the only difference that sk is used to get cgroup id instead of skb, and share code with them. See documentation in UAPI for more details. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f5884981249ce911f63e9b57ecd5d7d19154ff39.1589486450.git.rdna@fb.com --- include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- net/core/filter.c | 60 +++++++++++++++++++++++++++++++++++------- tools/include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- 3 files changed, 121 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index a47dc5b9dad4..5815902bb617 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { }; #ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); - struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgroup_id(cgrp); + return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, - ancestor_level) +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); struct cgroup *ancestor; struct cgroup *cgrp; - if (!sk || !sk_fullsock(sk)) - return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, return cgroup_id(ancestor); } +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, @@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -6159,6 +6197,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From c8741e2bfe872425ea6f10bb6f7dc1d67bc60c3a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:25 +0200 Subject: xdp: Allow bpf_xdp_adjust_tail() to grow packet size Finally, after all drivers have a frame size, allow BPF-helper bpf_xdp_adjust_tail() to grow or extend packet size at frame tail. Remember that helper/macro xdp_data_hard_end have reserved some tailroom. Thus, this helper makes sure that the BPF-prog don't have access to this tailroom area. V2: Remove one chicken check and use WARN_ONCE for other Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945348530.97035.12577148209134239291.stgit@firesoul --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 32cbf36c7729..b9b8a0f63b91 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2015,8 +2015,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers diff --git a/net/core/filter.c b/net/core/filter.c index 5815902bb617..e7b033dad44e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3411,12 +3411,19 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) return -EINVAL; + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); + return -EINVAL; + } + if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; -- cgit v1.2.3 From a17b53c4a4b55ec322c132b6670743612229ee9c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:53 -0700 Subject: bpf, capability: Introduce CAP_BPF Split BPF operations that are allowed under CAP_SYS_ADMIN into combination of CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN. For backward compatibility include them in CAP_SYS_ADMIN as well. The end result provides simple safety model for applications that use BPF: - to load tracing program types BPF_PROG_TYPE_{KPROBE, TRACEPOINT, PERF_EVENT, RAW_TRACEPOINT, etc} use CAP_BPF and CAP_PERFMON - to load networking program types BPF_PROG_TYPE_{SCHED_CLS, XDP, SK_SKB, etc} use CAP_BPF and CAP_NET_ADMIN There are few exceptions from this rule: - bpf_trace_printk() is allowed in networking programs, but it's using tracing mechanism, hence this helper needs additional CAP_PERFMON if networking program is using this helper. - BPF_F_ZERO_SEED flag for hash/lru map is allowed under CAP_SYS_ADMIN only to discourage production use. - BPF HW offload is allowed under CAP_SYS_ADMIN. - bpf_probe_write_user() is allowed under CAP_SYS_ADMIN only. CAPs are not checked at attach/detach time with two exceptions: - loading BPF_PROG_TYPE_CGROUP_SKB is allowed for unprivileged users, hence CAP_NET_ADMIN is required at attach time. - flow_dissector detach doesn't check prog FD at detach, hence CAP_NET_ADMIN is required at detach time. CAP_SYS_ADMIN is required to iterate BPF objects (progs, maps, links) via get_next_id command and convert them to file descriptor via GET_FD_BY_ID command. This restriction guarantees that mutliple tasks with CAP_BPF are not able to affect each other. That leads to clean isolation of tasks. For example: task A with CAP_BPF and CAP_NET_ADMIN loads and attaches a firewall via bpf_link. task B with the same capabilities cannot detach that firewall unless task A explicitly passed link FD to task B via scm_rights or bpffs. CAP_SYS_ADMIN can still detach/unload everything. Two networking user apps with CAP_SYS_ADMIN and CAP_NET_ADMIN can accidentely mess with each other programs and maps. Two networking user apps with CAP_NET_ADMIN and CAP_BPF cannot affect each other. CAP_NET_ADMIN + CAP_BPF allows networking programs access only packet data. Such networking progs cannot access arbitrary kernel memory or leak pointers. bpftool, bpftrace, bcc tools binaries should NOT be installed with CAP_BPF and CAP_PERFMON, since unpriv users will be able to read kernel secrets. But users with these two permissions will be able to use these tracing tools. CAP_PERFMON is least secure, since it allows kprobes and kernel memory access. CAP_NET_ADMIN can stop network traffic via iproute2. CAP_BPF is the safest from security point of view and harmless on its own. Having CAP_BPF and/or CAP_NET_ADMIN is not enough to write into arbitrary map and if that map is used by firewall-like bpf prog. CAP_BPF allows many bpf prog_load commands in parallel. The verifier may consume large amount of memory and significantly slow down the system. Existing unprivileged BPF operations are not affected. In particular unprivileged users are allowed to load socket_filter and cg_skb program types and to create array, hash, prog_array, map-in-map map types. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-2-alexei.starovoitov@gmail.com --- include/linux/capability.h | 5 +++++ include/uapi/linux/capability.h | 34 +++++++++++++++++++++++++++++++++- security/selinux/include/classmap.h | 4 ++-- 3 files changed, 40 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/capability.h b/include/linux/capability.h index 027d7e4a853b..b4345b38a6be 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -256,6 +256,11 @@ static inline bool perfmon_capable(void) return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } +static inline bool bpf_capable(void) +{ + return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); +} + /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index e58c9636741b..c7372180a0a9 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -274,6 +274,7 @@ struct vfs_ns_cap_data { arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ +/* Allow everything under CAP_BPF and CAP_PERFMON for backward compatibility */ #define CAP_SYS_ADMIN 21 @@ -374,7 +375,38 @@ struct vfs_ns_cap_data { #define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_PERFMON +/* + * CAP_BPF allows the following BPF operations: + * - Creating all types of BPF maps + * - Advanced verifier features + * - Indirect variable access + * - Bounded loops + * - BPF to BPF function calls + * - Scalar precision tracking + * - Larger complexity limits + * - Dead code elimination + * - And potentially other features + * - Loading BPF Type Format (BTF) data + * - Retrieve xlated and JITed code of BPF programs + * - Use bpf_spin_lock() helper + * + * CAP_PERFMON relaxes the verifier checks further: + * - BPF progs can use of pointer-to-integer conversions + * - speculation attack hardening measures are bypassed + * - bpf_probe_read to read arbitrary kernel memory is allowed + * - bpf_trace_printk to print kernel memory is allowed + * + * CAP_SYS_ADMIN is required to use bpf_probe_write_user. + * + * CAP_SYS_ADMIN is required to iterate system wide loaded + * programs, maps, links, BTFs and convert their IDs to file descriptors. + * + * CAP_PERFMON and CAP_BPF are required to load tracing programs. + * CAP_NET_ADMIN and CAP_BPF are required to load networking programs. + */ +#define CAP_BPF 39 + +#define CAP_LAST_CAP CAP_BPF #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index d233ab3f1533..98e1513b608a 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read", "perfmon" + "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf" -#if CAP_LAST_CAP > CAP_PERFMON +#if CAP_LAST_CAP > CAP_BPF #error New capability defined, please update COMMON_CAP2_PERMS. #endif -- cgit v1.2.3 From f8ab1807a9c9aa14478920e64d1c9d3685aae26f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 15 May 2020 14:40:11 +0300 Subject: net: sched: introduce terse dump flag Add new TCA_DUMP_FLAGS attribute and use it in cls API to request terse filter output from classifiers with TCA_DUMP_FLAGS_TERSE flag. This option is intended to be used to improve performance of TC filter dump when userland only needs to obtain stats and not the whole classifier/action data. Extend struct tcf_proto_ops with new terse_dump() callback that must be defined by supporting classifier implementations. Support of the options in specific classifiers and actions is implemented in following patches in the series. Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 ++++ include/uapi/linux/rtnetlink.h | 6 ++++++ net/sched/cls_api.c | 39 +++++++++++++++++++++++++++++++-------- 3 files changed, 41 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ab87a8b86a32..c510b03b9751 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -330,6 +330,10 @@ struct tcf_proto_ops { int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*, bool); + int (*terse_dump)(struct net *net, + struct tcf_proto *tp, void *fh, + struct sk_buff *skb, + struct tcmsg *t, bool rtnl_held); int (*tmplt_dump)(struct sk_buff *skb, struct net *net, void *tmplt_priv); diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 4a8c5b745157..073e71ef6bdd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -609,11 +609,17 @@ enum { TCA_HW_OFFLOAD, TCA_INGRESS_BLOCK, TCA_EGRESS_BLOCK, + TCA_DUMP_FLAGS, __TCA_MAX }; #define TCA_MAX (__TCA_MAX - 1) +#define TCA_DUMP_FLAGS_TERSE (1 << 0) /* Means that in dump user gets only basic + * data necessary to identify the objects + * (handle, cookie, etc.) and stats. + */ + #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 299b963c796e..cb2c10e0fee5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1851,7 +1851,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool rtnl_held) + bool terse_dump, bool rtnl_held) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1878,6 +1878,14 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (!fh) { tcm->tcm_handle = 0; + } else if (terse_dump) { + if (tp->ops->terse_dump) { + if (tp->ops->terse_dump(net, tp, fh, skb, tcm, + rtnl_held) < 0) + goto nla_put_failure; + } else { + goto cls_op_not_supp; + } } else { if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) @@ -1888,6 +1896,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, out_nlmsg_trim: nla_put_failure: +cls_op_not_supp: nlmsg_trim(skb, b); return -1; } @@ -1908,7 +1917,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1940,7 +1949,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -2501,6 +2510,7 @@ struct tcf_dump_args { struct tcf_block *block; struct Qdisc *q; u32 parent; + bool terse_dump; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -2511,12 +2521,12 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true); + RTM_NEWTFILTER, a->terse_dump, true); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, struct sk_buff *skb, struct netlink_callback *cb, - long index_start, long *p_index) + long index_start, long *p_index, bool terse) { struct net *net = sock_net(skb->sk); struct tcf_block *block = chain->block; @@ -2545,7 +2555,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true) <= 0) + RTM_NEWTFILTER, false, true) <= 0) goto errout; cb->args[1] = 1; } @@ -2561,6 +2571,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.skip = cb->args[1] - 1; arg.w.count = 0; arg.w.cookie = cb->args[2]; + arg.terse_dump = terse; tp->ops->walk(tp, &arg.w, true); cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; @@ -2574,6 +2585,10 @@ errout: return false; } +static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = { + [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE), +}; + /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2583,6 +2598,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); + bool terse_dump = false; long index_start; long index; u32 parent; @@ -2592,10 +2608,17 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, - NULL, cb->extack); + tcf_tfilter_dump_policy, cb->extack); if (err) return err; + if (tca[TCA_DUMP_FLAGS]) { + struct nla_bitfield32 flags = + nla_get_bitfield32(tca[TCA_DUMP_FLAGS]); + + terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE; + } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) @@ -2653,7 +2676,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, - index_start, &index)) { + index_start, &index, terse_dump)) { tcf_chain_put(chain); err = -EMSGSIZE; break; -- cgit v1.2.3 From 0d9b5b3af134cddfdc1dd31d41946a0ad389bbf2 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 15 May 2020 18:38:04 +0200 Subject: io_uring: add 'cq_flags' field for the CQ ring This patch adds the new 'cq_flags' field that should be written by the application and read by the kernel. This new field is available to the userspace application through 'cq_off.flags'. We are using 4-bytes previously reserved and set to zero. This means that if the application finds this field to zero, then the new functionality is not supported. In the next patch we will introduce the first flag available. Signed-off-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++++- include/uapi/linux/io_uring.h | 4 +++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index 982066844c5a..02250693a406 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -142,7 +142,7 @@ struct io_rings { */ u32 sq_dropped; /* - * Runtime flags + * Runtime SQ flags * * Written by the kernel, shouldn't be modified by the * application. @@ -151,6 +151,13 @@ struct io_rings { * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ u32 sq_flags; + /* + * Runtime CQ flags + * + * Written by the application, shouldn't be modified by the + * kernel. + */ + u32 cq_flags; /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure @@ -7930,6 +7937,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e48d746b8e2a..602bb0ece607 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -204,7 +204,9 @@ struct io_cqring_offsets { __u32 ring_entries; __u32 overflow; __u32 cqes; - __u64 resv[2]; + __u32 flags; + __u32 resv1; + __u64 resv2; }; /* -- cgit v1.2.3 From 7e55a19cf6e70ce08964b46dbbfbdb07fbc995fc Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 15 May 2020 18:38:05 +0200 Subject: io_uring: add IORING_CQ_EVENTFD_DISABLED to the CQ ring flags This new flag should be set/clear from the application to disable/enable eventfd notifications when a request is completed and queued to the CQ ring. Before this patch, notifications were always sent if an eventfd is registered, so IORING_CQ_EVENTFD_DISABLED is not set during the initialization. It will be up to the application to set the flag after initialization if no notifications are required at the beginning. Signed-off-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ include/uapi/linux/io_uring.h | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index 02250693a406..f800b0b4498f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1153,6 +1153,8 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) { if (!ctx->cq_ev_fd) return false; + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return false; if (!ctx->eventfd_async) return true; return io_wq_current_is_worker(); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 602bb0ece607..8c5775df08b8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -209,6 +209,13 @@ struct io_cqring_offsets { __u64 resv2; }; +/* + * cq_ring->flags + */ + +/* disable eventfd notifications */ +#define IORING_CQ_EVENTFD_DISABLED (1U << 0) + /* * io_uring_enter(2) flags */ -- cgit v1.2.3 From f2a8d5c7a218b9c24befb756c4eb30aa550ce822 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 17 May 2020 14:18:06 +0300 Subject: io_uring: add tee(2) support Add IORING_OP_TEE implementing tee(2) support. Almost identical to splice bits, but without offsets. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 62 ++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 1 + 2 files changed, 60 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index c43340396f6d..026350b9c33f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -853,6 +853,11 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2748,7 +2753,8 @@ out_free: return ret; } -static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_splice_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_splice* sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; @@ -2758,8 +2764,6 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; sp->file_in = NULL; - sp->off_in = READ_ONCE(sqe->splice_off_in); - sp->off_out = READ_ONCE(sqe->off); sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); @@ -2778,6 +2782,46 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static int io_tee_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) + return -EINVAL; + return __io_splice_prep(req, sqe); +} + +static int io_tee(struct io_kiocb *req, bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + long ret = 0; + + if (force_nonblock) + return -EAGAIN; + if (sp->len) + ret = do_tee(in, out, sp->len, flags); + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_cqring_add_event(req, ret); + if (ret != sp->len) + req_set_fail_links(req); + io_put_req(req); + return 0; +} + +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + return __io_splice_prep(req, sqe); +} + static int io_splice(struct io_kiocb *req, bool force_nonblock) { struct io_splice *sp = &req->splice; @@ -5085,6 +5129,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_REMOVE_BUFFERS: ret = io_remove_buffers_prep(req, sqe); break; + case IORING_OP_TEE: + ret = io_tee_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -5158,6 +5205,7 @@ static void io_cleanup_req(struct io_kiocb *req) putname(req->open.filename); break; case IORING_OP_SPLICE: + case IORING_OP_TEE: io_put_file(req, req->splice.file_in, (req->splice.flags & SPLICE_F_FD_IN_FIXED)); break; @@ -5388,6 +5436,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_remove_buffers(req, force_nonblock); break; + case IORING_OP_TEE: + if (sqe) { + ret = io_tee_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_tee(req, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8c5775df08b8..92c22699a5a7 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -129,6 +129,7 @@ enum { IORING_OP_SPLICE, IORING_OP_PROVIDE_BUFFERS, IORING_OP_REMOVE_BUFFERS, + IORING_OP_TEE, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 926645d43fd43622a2b056471a2cf41cc19cbf4c Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Sat, 9 May 2020 11:04:48 +0200 Subject: media: v4l2-ctrls: Add camera orientation and rotation Add support for the newly defined V4L2_CID_CAMERA_ORIENTATION and V4L2_CID_CAMERA_SENSOR_ROTATION read-only controls used to report the camera device mounting position and orientation respectively. Reviewed-by: Laurent Pinchart Signed-off-by: Jacopo Mondi Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-ctrls.c | 13 +++++++++++++ include/uapi/linux/v4l2-controls.h | 7 +++++++ 2 files changed, 20 insertions(+) (limited to 'include/uapi') diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 1c617b42a944..92c3e39efc28 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -583,6 +583,12 @@ const char * const *v4l2_ctrl_get_menu(u32 id) "Annex B Start Code", NULL, }; + static const char * const camera_orientation[] = { + "Front", + "Back", + "External", + NULL, + }; switch (id) { case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: @@ -708,6 +714,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id) return hevc_decode_mode; case V4L2_CID_MPEG_VIDEO_HEVC_START_CODE: return hevc_start_code; + case V4L2_CID_CAMERA_ORIENTATION: + return camera_orientation; default: return NULL; } @@ -1020,6 +1028,8 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_PAN_SPEED: return "Pan, Speed"; case V4L2_CID_TILT_SPEED: return "Tilt, Speed"; case V4L2_CID_UNIT_CELL_SIZE: return "Unit Cell Size"; + case V4L2_CID_CAMERA_ORIENTATION: return "Camera Orientation"; + case V4L2_CID_CAMERA_SENSOR_ROTATION: return "Camera Sensor Rotation"; /* FM Radio Modulator controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ @@ -1293,6 +1303,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: case V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE: case V4L2_CID_MPEG_VIDEO_HEVC_START_CODE: + case V4L2_CID_CAMERA_ORIENTATION: *type = V4L2_CTRL_TYPE_MENU; break; case V4L2_CID_LINK_FREQ: @@ -1482,6 +1493,8 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT: case V4L2_CID_RDS_RX_TRAFFIC_PROGRAM: case V4L2_CID_RDS_RX_MUSIC_SPEECH: + case V4L2_CID_CAMERA_ORIENTATION: + case V4L2_CID_CAMERA_SENSOR_ROTATION: *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_RF_TUNER_PLL_LOCK: diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 0ba1005c9651..62271418c1be 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -923,6 +923,13 @@ enum v4l2_auto_focus_range { #define V4L2_CID_PAN_SPEED (V4L2_CID_CAMERA_CLASS_BASE+32) #define V4L2_CID_TILT_SPEED (V4L2_CID_CAMERA_CLASS_BASE+33) +#define V4L2_CID_CAMERA_ORIENTATION (V4L2_CID_CAMERA_CLASS_BASE+34) +#define V4L2_CAMERA_ORIENTATION_FRONT 0 +#define V4L2_CAMERA_ORIENTATION_BACK 1 +#define V4L2_CAMERA_ORIENTATION_EXTERNAL 2 + +#define V4L2_CID_CAMERA_SENSOR_ROTATION (V4L2_CID_CAMERA_CLASS_BASE+35) + /* FM Modulator class control IDs */ #define V4L2_CID_FM_TX_CLASS_BASE (V4L2_CTRL_CLASS_FM_TX | 0x900) -- cgit v1.2.3 From b0d1f8741b812352fe0e5f3b2381427085f23e19 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:46 +0800 Subject: iommu/vt-d: Add nested translation helper function Nested translation mode is supported in VT-d 3.0 Spec.CH 3.8. With PASID granular translation type set to 0x11b, translation result from the first level(FL) also subject to a second level(SL) page table translation. This mode is used for SVA virtualization, where FL performs guest virtual to guest physical translation and SL performs guest physical to host physical translation. This patch adds a helper function for setting up nested translation where second level comes from a domain and first level comes from a guest PGD. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 25 ------- drivers/iommu/intel-pasid.c | 174 +++++++++++++++++++++++++++++++++++++++++++- drivers/iommu/intel-pasid.h | 10 +++ include/linux/intel-iommu.h | 20 +++++ include/uapi/linux/iommu.h | 5 ++ 5 files changed, 206 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 8027f21073eb..7e85c09eec71 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -296,31 +296,6 @@ static inline void context_clear_entry(struct context_entry *context) static struct dmar_domain *si_domain; static int hw_pass_through = 1; -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) - -/* - * This is a DMA domain allocated through the iommu domain allocation - * interface. But one or more devices belonging to this domain have - * been chosen to use a private domain. We should avoid to use the - * map/unmap/iova_to_phys APIs on it. - */ -#define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) - -/* - * When VT-d works in the scalable mode, it allows DMA translation to - * happen through either first level or second level page table. This - * bit marks that the DMA translation for the domain goes through the - * first level page table, otherwise, it goes through the second level. - */ -#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2) - -/* - * Domain represents a virtual machine which demands iommu nested - * translation mode support. - */ -#define DOMAIN_FLAG_NESTING_MODE BIT(3) - #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ if (domain->iommu_refcnt[idx]) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index d9cea3011b58..c7fa1b79eaf7 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -359,6 +359,16 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, int pasid) @@ -492,7 +502,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, 1); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); @@ -561,7 +571,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, 2); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -595,7 +605,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, 4); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -609,3 +619,161 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } + +static int +intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, + struct iommu_gpasid_bind_data_vtd *pasid_data) +{ + /* + * Not all guest PASID table entry fields are passed down during bind, + * here we only set up the ones that are dependent on guest settings. + * Execution related bits such as NXE, SMEP are not supported. + * Other fields, such as snoop related, are set based on host needs + * regardless of guest settings. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) { + if (!ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_sre(pte); + } + + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) { + if (!ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_eafe(pte); + } + + /* + * Memory type is only applicable to devices inside processor coherent + * domain. Will add MTS support once coherent devices are available. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) { + pr_warn_ratelimited("No memory type support %s\n", + iommu->name); + return -EINVAL; + } + + return 0; +} + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * This could be used for guest shared virtual address. In this case, the + * first level page tables are used for GVA-GPA translation in the guest, + * second level page tables are used for GPA-HPA translation. + * + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @gpgd: FLPTPTR: First Level Page translation pointer in GPA + * @pasid: PASID to be programmed in the device PASID table + * @pasid_data: Additional PASID info from the guest bind request + * @domain: Domain info for setting up second level page tables + * @addr_width: Address width of the first level (guest) + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + pgd_t *gpgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width) +{ + struct pasid_entry *pte; + struct dma_pte *pgd; + int ret = 0; + u64 pgd_val; + int agaw; + u16 did; + + if (!ecap_nest(iommu->ecap)) { + pr_err_ratelimited("IOMMU: %s: No nested translation support\n", + iommu->name); + return -EINVAL; + } + + if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) { + pr_err_ratelimited("Domain is not in nesting mode, %x\n", + domain->flags); + return -EINVAL; + } + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -EINVAL; + + /* + * Caller must ensure PASID entry is not in use, i.e. not bind the + * same PASID to the same device twice. + */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + + pasid_clear_entry(pte); + + /* Sanity checking performed by caller to make sure address + * width matching in two dimensions: + * 1. CPU vs. IOMMU + * 2. Guest vs. Host. + */ + switch (addr_width) { +#ifdef CONFIG_X86 + case ADDR_WIDTH_5LEVEL: + if (!cpu_feature_enabled(X86_FEATURE_LA57) || + !cap_5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + + pasid_set_flpm(pte, 1); + break; +#endif + case ADDR_WIDTH_4LEVEL: + pasid_set_flpm(pte, 0); + break; + default: + dev_err_ratelimited(dev, "Invalid guest address width %d\n", + addr_width); + return -EINVAL; + } + + /* First level PGD is in GPA, must be supported by the second level */ + if ((unsigned long long)gpgd > domain->max_addr) { + dev_err_ratelimited(dev, + "Guest PGD %llx not supported, max %llx\n", + (unsigned long long)gpgd, domain->max_addr); + return -EINVAL; + } + pasid_set_flptr(pte, (u64)gpgd); + + ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data); + if (ret) + return ret; + + /* Setup the second level based on the given domain */ + pgd = domain->pgd; + + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err_ratelimited(dev, "Invalid domain page table\n"); + return -EINVAL; + } + pgd_val = virt_to_phys(pgd); + pasid_set_slptr(pte, pgd_val); + pasid_set_fault_enable(pte); + + did = domain->iommu_did[iommu->seq_id]; + pasid_set_domain_id(pte, did); + + pasid_set_address_width(pte, agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + pasid_flush_caches(iommu, pte, pasid, did); + + return ret; +} diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index 92de6df24ccb..ccd50c2ae75c 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -36,6 +36,7 @@ * to vmalloc or even module mappings. */ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) +#define PASID_FLAG_NESTED BIT(1) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- @@ -51,6 +52,11 @@ struct pasid_entry { u64 val[8]; }; +#define PASID_ENTRY_PGTT_FL_ONLY (1) +#define PASID_ENTRY_PGTT_SL_ONLY (2) +#define PASID_ENTRY_PGTT_NESTED (3) +#define PASID_ENTRY_PGTT_PT (4) + /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ @@ -99,6 +105,10 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, int pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index ed7171d2ae1f..e0d1fed7cbe4 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -42,6 +42,9 @@ #define DMA_FL_PTE_PRESENT BIT_ULL(0) #define DMA_FL_PTE_XD BIT_ULL(63) +#define ADDR_WIDTH_5LEVEL (57) +#define ADDR_WIDTH_4LEVEL (48) + #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 @@ -480,6 +483,23 @@ struct context_entry { u64 hi; }; +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) + +/* + * When VT-d works in the scalable mode, it allows DMA translation to + * happen through either first level or second level page table. This + * bit marks that the DMA translation for the domain goes through the + * first level page table, otherwise, it goes through the second level. + */ +#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) + +/* + * Domain represents a virtual machine which demands iommu nested + * translation mode support. + */ +#define DOMAIN_FLAG_NESTING_MODE BIT(2) + struct dmar_domain { int nid; /* node id */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 4ad3496e5c43..e907b7091a46 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -285,6 +285,11 @@ struct iommu_gpasid_bind_data_vtd { __u32 emt; }; +#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ + IOMMU_SVA_VTD_GPASID_EMTE | \ + IOMMU_SVA_VTD_GPASID_PCD | \ + IOMMU_SVA_VTD_GPASID_PWT) + /** * struct iommu_gpasid_bind_data - Information about device and guest PASID binding * @version: Version of this data structure -- cgit v1.2.3 From 43c8546bcd854806736d8a635a0d696504dd4c21 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 28 Apr 2020 01:28:43 -0400 Subject: drm/amdgpu: Add a UAPI flag for user to call mem_sync When this flag is set in the CS IB flags, it causes a memory cache flush of the GFX. v2: Move new flag to drm_amdgpu_cs_chunk_ib.flags Bump up UAPI version Remove condition on job != null to emit mem_sync Signed-off-by: Andrey Grodzovsky Reviewed-by: Luben Tuikov Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 3 +++ include/uapi/drm/amdgpu_drm.h | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index beb35dd12964..a0e5b54b6e47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -86,9 +86,10 @@ * - 3.35.0 - Add drm_amdgpu_info_device::tcc_disabled_mask * - 3.36.0 - Allow reading more status registers on si/cik * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness + * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 37 +#define KMS_DRIVER_MINOR 38 #define KMS_DRIVER_PATCHLEVEL 0 int amdgpu_vram_limit = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index c24366aacf3a..b91853fd66d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -189,6 +189,9 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, dma_fence_put(tmp); } + if ((ib->flags & AMDGPU_IB_FLAG_EMIT_MEM_SYNC) && ring->funcs->emit_mem_sync) + ring->funcs->emit_mem_sync(ring); + if (ring->funcs->insert_start) ring->funcs->insert_start(ring); diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index e01b673f0449..4e873dcbe68f 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -602,6 +602,10 @@ union drm_amdgpu_cs { */ #define AMDGPU_IB_FLAGS_SECURE (1 << 5) +/* Tell KMD to flush and invalidate caches + */ +#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC (1 << 6) + struct drm_amdgpu_cs_chunk_ib { __u32 _pad; /** AMDGPU_IB_FLAG_* */ -- cgit v1.2.3 From ab723b7a992a19b843f798b183f53f7472f598c8 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Fri, 24 Jan 2020 00:57:10 +0100 Subject: drm/msm: Add syncobj support. This 1) Enables core DRM syncobj support. 2) Adds options to the submission ioctl to wait/signal syncobjs. Just like the wait fence fd, this does inline waits. Using the scheduler would be nice but I believe it is out of scope for this work. Support for timeline syncobjs is implemented and the interface is ready for it, but I'm not enabling it yet until there is some code for turnip to use it. The reset is mostly in there because in the presence of waiting and signalling the same semaphores, resetting them after signalling can become very annoying. v2: - Fixed style issues - Removed a cleanup issue in a failure case - Moved to a copy_from_user per syncobj v3: - Fixed a missing declaration introduced in v2 - Reworked to use ERR_PTR/PTR_ERR - Simplified failure gotos. Used by: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2769 Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_drv.c | 6 +- drivers/gpu/drm/msm/msm_gem_submit.c | 232 ++++++++++++++++++++++++++++++++++- include/uapi/drm/msm_drm.h | 24 +++- 3 files changed, 258 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 29295dee2a2e..f6ce40bf3699 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -37,9 +37,10 @@ * - 1.4.0 - softpin, MSM_RELOC_BO_DUMP, and GEM_INFO support to set/get * GEM object's debug name * - 1.5.0 - Add SUBMITQUERY_QUERY ioctl + * - 1.6.0 - Syncobj support */ #define MSM_VERSION_MAJOR 1 -#define MSM_VERSION_MINOR 5 +#define MSM_VERSION_MINOR 6 #define MSM_VERSION_PATCHLEVEL 0 static const struct drm_mode_config_funcs mode_config_funcs = { @@ -1002,7 +1003,8 @@ static struct drm_driver msm_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER | DRIVER_ATOMIC | - DRIVER_MODESET, + DRIVER_MODESET | + DRIVER_SYNCOBJ, .open = msm_open, .postclose = msm_postclose, .lastclose = drm_fb_helper_lastclose, diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 385d4965a8d0..6630aa817505 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -8,7 +8,9 @@ #include #include +#include #include +#include #include "msm_drv.h" #include "msm_gpu.h" @@ -391,6 +393,186 @@ static void submit_cleanup(struct msm_gem_submit *submit) } } + +struct msm_submit_post_dep { + struct drm_syncobj *syncobj; + uint64_t point; + struct dma_fence_chain *chain; +}; + +static struct drm_syncobj **msm_wait_deps(struct drm_device *dev, + struct drm_file *file, + uint64_t in_syncobjs_addr, + uint32_t nr_in_syncobjs, + size_t syncobj_stride, + struct msm_ringbuffer *ring) +{ + struct drm_syncobj **syncobjs = NULL; + struct drm_msm_gem_submit_syncobj syncobj_desc = {0}; + int ret = 0; + uint32_t i, j; + + syncobjs = kcalloc(nr_in_syncobjs, sizeof(*syncobjs), + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!syncobjs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_in_syncobjs; ++i) { + uint64_t address = in_syncobjs_addr + i * syncobj_stride; + struct dma_fence *fence; + + if (copy_from_user(&syncobj_desc, + u64_to_user_ptr(address), + min(syncobj_stride, sizeof(syncobj_desc)))) { + ret = -EFAULT; + break; + } + + if (syncobj_desc.point && + !drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE)) { + ret = -EOPNOTSUPP; + break; + } + + if (syncobj_desc.flags & ~MSM_SUBMIT_SYNCOBJ_FLAGS) { + ret = -EINVAL; + break; + } + + ret = drm_syncobj_find_fence(file, syncobj_desc.handle, + syncobj_desc.point, 0, &fence); + if (ret) + break; + + if (!dma_fence_match_context(fence, ring->fctx->context)) + ret = dma_fence_wait(fence, true); + + dma_fence_put(fence); + if (ret) + break; + + if (syncobj_desc.flags & MSM_SUBMIT_SYNCOBJ_RESET) { + syncobjs[i] = + drm_syncobj_find(file, syncobj_desc.handle); + if (!syncobjs[i]) { + ret = -EINVAL; + break; + } + } + } + + if (ret) { + for (j = 0; j <= i; ++j) { + if (syncobjs[j]) + drm_syncobj_put(syncobjs[j]); + } + kfree(syncobjs); + return ERR_PTR(ret); + } + return syncobjs; +} + +static void msm_reset_syncobjs(struct drm_syncobj **syncobjs, + uint32_t nr_syncobjs) +{ + uint32_t i; + + for (i = 0; syncobjs && i < nr_syncobjs; ++i) { + if (syncobjs[i]) + drm_syncobj_replace_fence(syncobjs[i], NULL); + } +} + +static struct msm_submit_post_dep *msm_parse_post_deps(struct drm_device *dev, + struct drm_file *file, + uint64_t syncobjs_addr, + uint32_t nr_syncobjs, + size_t syncobj_stride) +{ + struct msm_submit_post_dep *post_deps; + struct drm_msm_gem_submit_syncobj syncobj_desc = {0}; + int ret = 0; + uint32_t i, j; + + post_deps = kmalloc_array(nr_syncobjs, sizeof(*post_deps), + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!post_deps) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_syncobjs; ++i) { + uint64_t address = syncobjs_addr + i * syncobj_stride; + + if (copy_from_user(&syncobj_desc, + u64_to_user_ptr(address), + min(syncobj_stride, sizeof(syncobj_desc)))) { + ret = -EFAULT; + break; + } + + post_deps[i].point = syncobj_desc.point; + post_deps[i].chain = NULL; + + if (syncobj_desc.flags) { + ret = -EINVAL; + break; + } + + if (syncobj_desc.point) { + if (!drm_core_check_feature(dev, + DRIVER_SYNCOBJ_TIMELINE)) { + ret = -EOPNOTSUPP; + break; + } + + post_deps[i].chain = + kmalloc(sizeof(*post_deps[i].chain), + GFP_KERNEL); + if (!post_deps[i].chain) { + ret = -ENOMEM; + break; + } + } + + post_deps[i].syncobj = + drm_syncobj_find(file, syncobj_desc.handle); + if (!post_deps[i].syncobj) { + ret = -EINVAL; + break; + } + } + + if (ret) { + for (j = 0; j <= i; ++j) { + kfree(post_deps[j].chain); + if (post_deps[j].syncobj) + drm_syncobj_put(post_deps[j].syncobj); + } + + kfree(post_deps); + return ERR_PTR(ret); + } + + return post_deps; +} + +static void msm_process_post_deps(struct msm_submit_post_dep *post_deps, + uint32_t count, struct dma_fence *fence) +{ + uint32_t i; + + for (i = 0; post_deps && i < count; ++i) { + if (post_deps[i].chain) { + drm_syncobj_add_point(post_deps[i].syncobj, + post_deps[i].chain, + fence, post_deps[i].point); + post_deps[i].chain = NULL; + } else { + drm_syncobj_replace_fence(post_deps[i].syncobj, + fence); + } + } +} + int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct drm_file *file) { @@ -403,6 +585,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct sync_file *sync_file = NULL; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; + struct msm_submit_post_dep *post_deps = NULL; + struct drm_syncobj **syncobjs_to_reset = NULL; int out_fence_fd = -1; struct pid *pid = get_pid(task_pid(current)); bool has_ww_ticket = false; @@ -411,6 +595,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, if (!gpu) return -ENXIO; + if (args->pad) + return -EINVAL; + /* for now, we just have 3d pipe.. eventually this would need to * be more clever to dispatch to appropriate gpu module: */ @@ -458,9 +645,29 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, return ret; } + if (args->flags & MSM_SUBMIT_SYNCOBJ_IN) { + syncobjs_to_reset = msm_wait_deps(dev, file, + args->in_syncobjs, + args->nr_in_syncobjs, + args->syncobj_stride, ring); + if (IS_ERR(syncobjs_to_reset)) + return PTR_ERR(syncobjs_to_reset); + } + + if (args->flags & MSM_SUBMIT_SYNCOBJ_OUT) { + post_deps = msm_parse_post_deps(dev, file, + args->out_syncobjs, + args->nr_out_syncobjs, + args->syncobj_stride); + if (IS_ERR(post_deps)) { + ret = PTR_ERR(post_deps); + goto out_post_unlock; + } + } + ret = mutex_lock_interruptible(&dev->struct_mutex); if (ret) - return ret; + goto out_post_unlock; if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { out_fence_fd = get_unused_fd_flags(O_CLOEXEC); @@ -587,6 +794,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, args->fence_fd = out_fence_fd; } + msm_reset_syncobjs(syncobjs_to_reset, args->nr_in_syncobjs); + msm_process_post_deps(post_deps, args->nr_out_syncobjs, + submit->fence); + + out: submit_cleanup(submit); if (has_ww_ticket) @@ -597,5 +809,23 @@ out_unlock: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd); mutex_unlock(&dev->struct_mutex); + +out_post_unlock: + if (!IS_ERR_OR_NULL(post_deps)) { + for (i = 0; i < args->nr_out_syncobjs; ++i) { + kfree(post_deps[i].chain); + drm_syncobj_put(post_deps[i].syncobj); + } + kfree(post_deps); + } + + if (!IS_ERR_OR_NULL(syncobjs_to_reset)) { + for (i = 0; i < args->nr_in_syncobjs; ++i) { + if (syncobjs_to_reset[i]) + drm_syncobj_put(syncobjs_to_reset[i]); + } + kfree(syncobjs_to_reset); + } + return ret; } diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 0b85ed6a3710..19806eb3a8e8 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -217,13 +217,28 @@ struct drm_msm_gem_submit_bo { #define MSM_SUBMIT_FENCE_FD_IN 0x40000000 /* enable input fence_fd */ #define MSM_SUBMIT_FENCE_FD_OUT 0x20000000 /* enable output fence_fd */ #define MSM_SUBMIT_SUDO 0x10000000 /* run submitted cmds from RB */ +#define MSM_SUBMIT_SYNCOBJ_IN 0x08000000 /* enable input syncobj */ +#define MSM_SUBMIT_SYNCOBJ_OUT 0x04000000 /* enable output syncobj */ #define MSM_SUBMIT_FLAGS ( \ MSM_SUBMIT_NO_IMPLICIT | \ MSM_SUBMIT_FENCE_FD_IN | \ MSM_SUBMIT_FENCE_FD_OUT | \ MSM_SUBMIT_SUDO | \ + MSM_SUBMIT_SYNCOBJ_IN | \ + MSM_SUBMIT_SYNCOBJ_OUT | \ 0) +#define MSM_SUBMIT_SYNCOBJ_RESET 0x00000001 /* Reset syncobj after wait. */ +#define MSM_SUBMIT_SYNCOBJ_FLAGS ( \ + MSM_SUBMIT_SYNCOBJ_RESET | \ + 0) + +struct drm_msm_gem_submit_syncobj { + __u32 handle; /* in, syncobj handle. */ + __u32 flags; /* in, from MSM_SUBMIT_SYNCOBJ_FLAGS */ + __u64 point; /* in, timepoint for timeline syncobjs. */ +}; + /* Each cmdstream submit consists of a table of buffers involved, and * one or more cmdstream buffers. This allows for conditional execution * (context-restore), and IB buffers needed for per tile/bin draw cmds. @@ -236,7 +251,14 @@ struct drm_msm_gem_submit { __u64 bos; /* in, ptr to array of submit_bo's */ __u64 cmds; /* in, ptr to array of submit_cmd's */ __s32 fence_fd; /* in/out fence fd (see MSM_SUBMIT_FENCE_FD_IN/OUT) */ - __u32 queueid; /* in, submitqueue id */ + __u32 queueid; /* in, submitqueue id */ + __u64 in_syncobjs; /* in, ptr to to array of drm_msm_gem_submit_syncobj */ + __u64 out_syncobjs; /* in, ptr to to array of drm_msm_gem_submit_syncobj */ + __u32 nr_in_syncobjs; /* in, number of entries in in_syncobj */ + __u32 nr_out_syncobjs; /* in, number of entries in out_syncobj. */ + __u32 syncobj_stride; /* in, stride of syncobj arrays. */ + __u32 pad; /*in, reserved for future use, always 0. */ + }; /* The normal way to synchronize with the GPU is just to CPU_PREP on -- cgit v1.2.3 From 25e7aeba601c1776cd21d610e3afc8768d0c7f2e Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Tue, 31 Mar 2020 22:46:36 +0300 Subject: habanalabs: Add INFO IOCTL opcode for time sync information Add a new opcode to the INFO IOCTL that retrieves the device time alongside the host time, to allow a user application that want to measure device time together with host time (such as a profiler) to synchronize these times. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 10 +++- drivers/misc/habanalabs/goya/goyaP.h | 1 + drivers/misc/habanalabs/habanalabs.h | 2 + drivers/misc/habanalabs/habanalabs_ioctl.c | 19 ++++++++ .../habanalabs/include/goya/asic_reg/goya_regs.h | 1 + .../include/goya/asic_reg/psoc_timestamp_regs.h | 56 ++++++++++++++++++++++ include/uapi/misc/habanalabs.h | 8 ++++ 7 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 drivers/misc/habanalabs/include/goya/asic_reg/psoc_timestamp_regs.h (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 19c3bdf4c358..ddc506d74988 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -5200,6 +5200,13 @@ static void goya_set_dma_mask_from_fw(struct hl_device *hdev) } } +u64 goya_get_device_time(struct hl_device *hdev) +{ + u64 device_time = ((u64) RREG32(mmPSOC_TIMESTAMP_CNTCVU)) << 32; + + return device_time | RREG32(mmPSOC_TIMESTAMP_CNTCVL); +} + static const struct hl_asic_funcs goya_funcs = { .early_init = goya_early_init, .early_fini = goya_early_fini, @@ -5263,7 +5270,8 @@ static const struct hl_asic_funcs goya_funcs = { .get_queue_id_for_cq = goya_get_queue_id_for_cq, .read_device_fw_version = goya_read_device_fw_version, .load_firmware_to_device = goya_load_firmware_to_device, - .set_dma_mask_from_fw = goya_set_dma_mask_from_fw + .set_dma_mask_from_fw = goya_set_dma_mask_from_fw, + .get_device_time = goya_get_device_time }; /* diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index 86857cdd36b1..d36f8d90c9c9 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -231,5 +231,6 @@ void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev); int goya_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk); u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx); +u64 goya_get_device_time(struct hl_device *hdev); #endif /* GOYAP_H_ */ diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 8db955485609..a8ee241b2fce 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -554,6 +554,7 @@ enum hl_pll_frequency { * @load_firmware_to_device: load the firmware to the device's memory * @set_dma_mask_from_fw: set the DMA mask in the driver according to the * firmware configuration + * @get_device_time: Get the device time. */ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -646,6 +647,7 @@ struct hl_asic_funcs { enum hl_fw_component fwc); int (*load_firmware_to_device)(struct hl_device *hdev); void (*set_dma_mask_from_fw)(struct hl_device *hdev); + u64 (*get_device_time)(struct hl_device *hdev); }; diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c index 6474b868ef27..f5993698d315 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -258,6 +258,22 @@ static int get_reset_count(struct hl_device *hdev, struct hl_info_args *args) min((size_t) max_size, sizeof(reset_count))) ? -EFAULT : 0; } +static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_time_sync time_sync = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + time_sync.device_time = hdev->asic_funcs->get_device_time(hdev); + time_sync.host_time = ktime_get_raw_ns(); + + return copy_to_user(out, &time_sync, + min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -315,6 +331,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, rc = get_clk_rate(hdev, args); break; + case HL_INFO_TIME_SYNC: + return time_sync_info(hdev, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/include/goya/asic_reg/goya_regs.h b/drivers/misc/habanalabs/include/goya/asic_reg/goya_regs.h index fce490e6a231..ce65c9da5c60 100644 --- a/drivers/misc/habanalabs/include/goya/asic_reg/goya_regs.h +++ b/drivers/misc/habanalabs/include/goya/asic_reg/goya_regs.h @@ -18,6 +18,7 @@ #include "psoc_mme_pll_regs.h" #include "psoc_pci_pll_regs.h" #include "psoc_emmc_pll_regs.h" +#include "psoc_timestamp_regs.h" #include "cpu_if_regs.h" #include "cpu_ca53_cfg_regs.h" #include "cpu_pll_regs.h" diff --git a/drivers/misc/habanalabs/include/goya/asic_reg/psoc_timestamp_regs.h b/drivers/misc/habanalabs/include/goya/asic_reg/psoc_timestamp_regs.h new file mode 100644 index 000000000000..9ce24597d4b0 --- /dev/null +++ b/drivers/misc/habanalabs/include/goya/asic_reg/psoc_timestamp_regs.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +/************************************ + ** This is an auto-generated file ** + ** DO NOT EDIT BELOW ** + ************************************/ + +#ifndef ASIC_REG_PSOC_TIMESTAMP_REGS_H_ +#define ASIC_REG_PSOC_TIMESTAMP_REGS_H_ + +/* + ***************************************** + * PSOC_TIMESTAMP (Prototype: TIMESTAMP) + ***************************************** + */ + +#define mmPSOC_TIMESTAMP_CNTCR 0xC49000 + +#define mmPSOC_TIMESTAMP_CNTSR 0xC49004 + +#define mmPSOC_TIMESTAMP_CNTCVL 0xC49008 + +#define mmPSOC_TIMESTAMP_CNTCVU 0xC4900C + +#define mmPSOC_TIMESTAMP_CNTFID0 0xC49020 + +#define mmPSOC_TIMESTAMP_PIDR4 0xC49FD0 + +#define mmPSOC_TIMESTAMP_PIDR5 0xC49FD4 + +#define mmPSOC_TIMESTAMP_PIDR6 0xC49FD8 + +#define mmPSOC_TIMESTAMP_PIDR7 0xC49FDC + +#define mmPSOC_TIMESTAMP_PIDR0 0xC49FE0 + +#define mmPSOC_TIMESTAMP_PIDR1 0xC49FE4 + +#define mmPSOC_TIMESTAMP_PIDR2 0xC49FE8 + +#define mmPSOC_TIMESTAMP_PIDR3 0xC49FEC + +#define mmPSOC_TIMESTAMP_CIDR0 0xC49FF0 + +#define mmPSOC_TIMESTAMP_CIDR1 0xC49FF4 + +#define mmPSOC_TIMESTAMP_CIDR2 0xC49FF8 + +#define mmPSOC_TIMESTAMP_CIDR3 0xC49FFC + +#endif /* ASIC_REG_PSOC_TIMESTAMP_REGS_H_ */ diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 4faa2c9767e5..4d593050c42b 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -101,6 +101,8 @@ enum hl_device_status { * HL_INFO_RESET_COUNT - Retrieve the counts of the soft and hard reset * operations performed on the device since the last * time the driver was loaded. + * HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time + * for synchronization. */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -111,6 +113,7 @@ enum hl_device_status { #define HL_INFO_HW_EVENTS_AGGREGATE 7 #define HL_INFO_CLK_RATE 8 #define HL_INFO_RESET_COUNT 9 +#define HL_INFO_TIME_SYNC 10 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -169,6 +172,11 @@ struct hl_info_reset_count { __u32 soft_reset_cnt; }; +struct hl_info_time_sync { + __u64 device_time; + __u64 host_time; +}; + struct hl_info_args { /* Location of relevant struct in userspace */ __u64 return_pointer; -- cgit v1.2.3 From 39b425170d35ff0841084007423f1b82f3b3e5ac Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Fri, 17 Apr 2020 12:12:13 +0300 Subject: habanalabs: leave space for 2xMSG_PROT in CB The user must leave space for 2xMSG_PROT in the external CB, so adjust the define of max size accordingly. The driver, however, can still create a CB with the maximum size of 2MB. Therefore, we need to add a check specifically for the user requested size. Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_buffer.c | 24 +++++++++++++++++------- include/uapi/misc/habanalabs.h | 3 ++- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/command_buffer.c b/drivers/misc/habanalabs/command_buffer.c index 53fddbd8e693..6cb92efce4d9 100644 --- a/drivers/misc/habanalabs/command_buffer.c +++ b/drivers/misc/habanalabs/command_buffer.c @@ -105,10 +105,9 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, goto out_err; } - if (cb_size > HL_MAX_CB_SIZE) { - dev_err(hdev->dev, - "CB size %d must be less then %d\n", - cb_size, HL_MAX_CB_SIZE); + if (cb_size > SZ_2M) { + dev_err(hdev->dev, "CB size %d must be less than %d\n", + cb_size, SZ_2M); rc = -EINVAL; goto out_err; } @@ -211,7 +210,7 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) { union hl_cb_args *args = data; struct hl_device *hdev = hpriv->hdev; - u64 handle; + u64 handle = 0; int rc; if (hl_device_disabled_or_in_reset(hdev)) { @@ -223,15 +222,26 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) switch (args->in.op) { case HL_CB_OP_CREATE: - rc = hl_cb_create(hdev, &hpriv->cb_mgr, args->in.cb_size, - &handle, hpriv->ctx->asid); + if (args->in.cb_size > HL_MAX_CB_SIZE) { + dev_err(hdev->dev, + "User requested CB size %d must be less than %d\n", + args->in.cb_size, HL_MAX_CB_SIZE); + rc = -EINVAL; + } else { + rc = hl_cb_create(hdev, &hpriv->cb_mgr, + args->in.cb_size, &handle, + hpriv->ctx->asid); + } + memset(args, 0, sizeof(*args)); args->out.cb_handle = handle; break; + case HL_CB_OP_DESTROY: rc = hl_cb_destroy(hdev, &hpriv->cb_mgr, args->in.cb_handle); break; + default: rc = -ENOTTY; break; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 4d593050c42b..523e511e6cff 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -209,7 +209,8 @@ struct hl_info_args { /* Opcode to destroy previously created command buffer */ #define HL_CB_OP_DESTROY 1 -#define HL_MAX_CB_SIZE 0x200000 /* 2MB */ +/* 2MB minus 32 bytes for 2xMSG_PROT */ +#define HL_MAX_CB_SIZE (0x200000 - 32) struct hl_cb_in { /* Handle of CB or 0 if we want to create one */ -- cgit v1.2.3 From f9e5f29518c1821d794bb7ec7e7c91650f4ded14 Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Thu, 7 May 2020 13:41:16 +0300 Subject: uapi: habanalabs: add signal/wait operations This is a pre-requisite to upstreaming GAUDI support. Signal/wait operations are done by the user to perform sync between two Primary Queues (PQs). The sync is done using the sync manager and it is usually resolved inside the device, but sometimes it can be resolved in the host, i.e. the user should be able to wait in the host until a signal has been completed. The mechanism to define signal and wait operations is done by the driver because it needs atomicity and serialization, which is already done in the driver when submitting work to the different queues. To implement this feature, the driver "takes" a couple of h/w resources, and this is reflected by the defines added to the uapi file. The signal/wait operations are done via the existing CS IOCTL, and they use the same data structure. There is a difference in the meaning of some of the parameters, and for that we added unions to make the code more readable. Signed-off-by: Omer Shpigelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 13 +++++- drivers/misc/habanalabs/habanalabs.h | 2 + include/uapi/misc/habanalabs.h | 67 +++++++++++++++++++++------- 3 files changed, 65 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index 6680e183d881..f7d03a35e6a8 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -11,6 +11,8 @@ #include #include +#define HL_CS_FLAGS_SIG_WAIT (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT) + static void job_wq_completion(struct work_struct *work); static long _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq); @@ -659,7 +661,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) union hl_cs_args *args = data; struct hl_ctx *ctx = hpriv->ctx; void __user *chunks_execute, *chunks_restore; - u32 num_chunks_execute, num_chunks_restore; + u32 num_chunks_execute, num_chunks_restore, sig_wait_flags; u64 cs_seq = ULONG_MAX; int rc, do_ctx_switch; bool need_soft_reset = false; @@ -672,6 +674,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) goto out; } + sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT; + + if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) && + (!hdev->supports_sync_stream))) { + dev_err(hdev->dev, "Sync stream CS is not supported\n"); + rc = -EINVAL; + goto out; + } + chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute; num_chunks_execute = args->in.num_chunks_execute; diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index b1dc6a22ba0d..7cd9a8d72451 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -1347,6 +1347,7 @@ struct hl_device_idle_busy_ts { * only to POWER9 machines. * @cdev_sysfs_created: were char devices and sysfs nodes created. * @stop_on_err: true if engines should stop on error. + * @supports_sync_stream: is sync stream supported. */ struct hl_device { struct pci_dev *pdev; @@ -1429,6 +1430,7 @@ struct hl_device { u8 power9_64bit_dma_enable; u8 cdev_sysfs_created; u8 stop_on_err; + u8 supports_sync_stream; /* Parameters for bring-up */ u8 mmu_enable; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 523e511e6cff..70dfccea7038 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note * - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2020 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -241,52 +241,87 @@ union hl_cb_args { * compatibility */ struct hl_cs_chunk { - /* - * For external queue, this represents a Handle of CB on the Host - * For internal queue, this represents an SRAM or DRAM address of the - * internal CB - */ - __u64 cb_handle; + union { + /* For external queue, this represents a Handle of CB on the + * Host. + * For internal queue in Goya, this represents an SRAM or + * a DRAM address of the internal CB. In Gaudi, this might also + * represent a mapped host address of the CB. + * + * A mapped host address is in the device address space, after + * a host address was mapped by the device MMU. + */ + __u64 cb_handle; + + /* Relevant only when HL_CS_FLAGS_WAIT is set. + * This holds address of array of u64 values that contain + * signal CS sequence numbers. The wait described by this job + * will listen on all those signals (wait event per signal) + */ + __u64 signal_seq_arr; + }; + /* Index of queue to put the CB on */ __u32 queue_index; - /* - * Size of command buffer with valid packets - * Can be smaller then actual CB size - */ - __u32 cb_size; + + union { + /* + * Size of command buffer with valid packets + * Can be smaller then actual CB size + */ + __u32 cb_size; + + /* Relevant only when HL_CS_FLAGS_WAIT is set. + * Number of entries in signal_seq_arr + */ + __u32 num_signal_seq_arr; + }; + /* HL_CS_CHUNK_FLAGS_* */ __u32 cs_chunk_flags; + /* Align structure to 64 bytes */ __u32 pad[11]; }; +/* SIGNAL and WAIT flags are mutually exclusive */ #define HL_CS_FLAGS_FORCE_RESTORE 0x1 +#define HL_CS_FLAGS_SIGNAL 0x2 +#define HL_CS_FLAGS_WAIT 0x4 #define HL_CS_STATUS_SUCCESS 0 #define HL_MAX_JOBS_PER_CS 512 struct hl_cs_in { + /* this holds address of array of hl_cs_chunk for restore phase */ __u64 chunks_restore; - /* this holds address of array of hl_cs_chunk for execution phase */ + + /* holds address of array of hl_cs_chunk for execution phase */ __u64 chunks_execute; + /* this holds address of array of hl_cs_chunk for store phase - * Currently not in use */ __u64 chunks_store; + /* Number of chunks in restore phase array. Maximum number is * HL_MAX_JOBS_PER_CS */ __u32 num_chunks_restore; + /* Number of chunks in execution array. Maximum number is * HL_MAX_JOBS_PER_CS */ __u32 num_chunks_execute; + /* Number of chunks in restore phase array - Currently not in use */ __u32 num_chunks_store; + /* HL_CS_FLAGS_* */ __u32 cs_flags; + /* Context ID - Currently not in use */ __u32 ctx_id; }; @@ -597,8 +632,8 @@ struct hl_debug_args { * For jobs on external queues, the user needs to create command buffers * through the CB ioctl and give the CB's handle to the CS ioctl. For jobs on * internal queues, the user needs to prepare a "command buffer" with packets - * on either the SRAM or DRAM, and give the device address of that buffer to - * the CS ioctl. + * on either the device SRAM/DRAM or the host, and give the device address of + * that buffer to the CS ioctl. * * This IOCTL is asynchronous in regard to the actual execution of the CS. This * means it returns immediately after ALL the JOBS were enqueued on their @@ -610,7 +645,7 @@ struct hl_debug_args { * external JOBS have been completed. Note that if the CS has internal JOBS * which can execute AFTER the external JOBS have finished, the driver might * report that the CS has finished executing BEFORE the internal JOBS have - * actually finish executing. + * actually finished executing. * * Even though the sequence number increments per CS, the user can NOT * automatically assume that if CS with sequence number N finished, then CS -- cgit v1.2.3 From fca72fbb661f95bed34aff2b9eb8806acab4643e Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Sun, 3 May 2020 17:35:54 +0300 Subject: habanalabs: get card type, location from F/W For Gaudi the driver gets two new additional properties from the F/W: 1. The card's type - PCI or PMC 2. The card's location in the Gaudi's box (relevant only for PMC). The card's location is also passed to the user in the HW IP info structure as it needs this property for establishing communication between Gaudis. Signed-off-by: Omer Shpigelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/habanalabs_ioctl.c | 2 ++ drivers/misc/habanalabs/include/armcp_if.h | 22 +++++++++++++++++++--- include/uapi/misc/habanalabs.h | 3 ++- 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c index f5993698d315..52eedd3a6c3a 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -71,6 +71,8 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN)); hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version); + hw_ip.module_id = le32_to_cpu(prop->armcp_info.card_location); + hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr; hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf; hw_ip.psoc_pci_pll_od = prop->psoc_pci_pll_od; diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h index 9e3bc21f20a0..a34fc39ad87e 100644 --- a/drivers/misc/habanalabs/include/armcp_if.h +++ b/drivers/misc/habanalabs/include/armcp_if.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2020 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -35,7 +35,8 @@ struct hl_eq_entry { enum pq_init_status { PQ_INIT_STATUS_NA = 0, PQ_INIT_STATUS_READY_FOR_CP, - PQ_INIT_STATUS_READY_FOR_HOST + PQ_INIT_STATUS_READY_FOR_HOST, + PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI }; /* @@ -350,11 +351,24 @@ struct armcp_sensor { __le32 flags; }; +/** + * struct armcp_card_types - ASIC card type. + * @armcp_card_type_pci: PCI card. + * @armcp_card_type_pmc: PCI Mezzanine Card. + */ +enum armcp_card_types { + armcp_card_type_pci, + armcp_card_type_pmc +}; + /** * struct armcp_info - Info from ArmCP that is necessary to the host's driver * @sensors: available sensors description. * @kernel_version: ArmCP linux kernel version. * @reserved: reserved field. + * @card_type: card configuration type. + * @card_location: in a server, each card has different connections topology + * depending on its location (relevant for PMC card type) * @cpld_version: CPLD programmed F/W version. * @infineon_version: Infineon main DC-DC version. * @fuse_version: silicon production FUSE information. @@ -366,7 +380,9 @@ struct armcp_sensor { struct armcp_info { struct armcp_sensor sensors[ARMCP_MAX_SENSORS]; __u8 kernel_version[VERSION_MAX_LEN]; - __le32 reserved[3]; + __le32 reserved; + __le32 card_type; + __le32 card_location; __le32 cpld_version; __le32 infineon_version; __u8 fuse_version[VERSION_MAX_LEN]; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 70dfccea7038..079613dd7aae 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -125,7 +125,8 @@ struct hl_info_hw_ip_info { __u32 sram_size; __u32 num_of_events; __u32 device_id; /* PCI Device ID */ - __u32 reserved[3]; + __u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */ + __u32 reserved[2]; __u32 armcp_cpld_version; __u32 psoc_pci_pll_nr; __u32 psoc_pci_pll_nf; -- cgit v1.2.3 From 466c7822b054ffe5bb425c8f98d08676501836e8 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 11 May 2020 10:32:10 +0300 Subject: uapi: habanalabs: add gaudi defines Add the new defines for GAUDI uapi interface. It includes the queue IDs, the engine IDs, SRAM reserved space and Sync Manager reserved resources. There is no new IOCTL or additional operations in existing IOCTLs. Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 164 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 079613dd7aae..f6267a8d7416 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -15,10 +15,13 @@ * Defines that are asic-specific but constitutes as ABI between kernel driver * and userspace */ -#define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START 0x8000 /* 32KB */ +#define GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START 0x8000 /* 32KB */ +#define GAUDI_DRIVER_SRAM_RESERVED_SIZE_FROM_START 0x80 /* 128 bytes */ +#define GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT 48 +#define GAUDI_FIRST_AVAILABLE_W_S_MONITOR 24 /* - * Queue Numbering + * Goya queue Numbering * * The external queues (PCI DMA channels) MUST be before the internal queues * and each group (PCI DMA channels and internal) must be contiguous inside @@ -45,6 +48,129 @@ enum goya_queue_id { GOYA_QUEUE_ID_SIZE }; +/* + * Gaudi queue Numbering + * External queues (PCI DMA channels) are DMA_0_*, DMA_1_* and DMA_5_*. + * Except one CPU queue, all the rest are internal queues. + */ + +enum gaudi_queue_id { + GAUDI_QUEUE_ID_DMA_0_0 = 0, /* external */ + GAUDI_QUEUE_ID_DMA_0_1 = 1, /* external */ + GAUDI_QUEUE_ID_DMA_0_2 = 2, /* external */ + GAUDI_QUEUE_ID_DMA_0_3 = 3, /* external */ + GAUDI_QUEUE_ID_DMA_1_0 = 4, /* external */ + GAUDI_QUEUE_ID_DMA_1_1 = 5, /* external */ + GAUDI_QUEUE_ID_DMA_1_2 = 6, /* external */ + GAUDI_QUEUE_ID_DMA_1_3 = 7, /* external */ + GAUDI_QUEUE_ID_CPU_PQ = 8, /* CPU */ + GAUDI_QUEUE_ID_DMA_2_0 = 9, /* internal */ + GAUDI_QUEUE_ID_DMA_2_1 = 10, /* internal */ + GAUDI_QUEUE_ID_DMA_2_2 = 11, /* internal */ + GAUDI_QUEUE_ID_DMA_2_3 = 12, /* internal */ + GAUDI_QUEUE_ID_DMA_3_0 = 13, /* internal */ + GAUDI_QUEUE_ID_DMA_3_1 = 14, /* internal */ + GAUDI_QUEUE_ID_DMA_3_2 = 15, /* internal */ + GAUDI_QUEUE_ID_DMA_3_3 = 16, /* internal */ + GAUDI_QUEUE_ID_DMA_4_0 = 17, /* internal */ + GAUDI_QUEUE_ID_DMA_4_1 = 18, /* internal */ + GAUDI_QUEUE_ID_DMA_4_2 = 19, /* internal */ + GAUDI_QUEUE_ID_DMA_4_3 = 20, /* internal */ + GAUDI_QUEUE_ID_DMA_5_0 = 21, /* external */ + GAUDI_QUEUE_ID_DMA_5_1 = 22, /* external */ + GAUDI_QUEUE_ID_DMA_5_2 = 23, /* external */ + GAUDI_QUEUE_ID_DMA_5_3 = 24, /* external */ + GAUDI_QUEUE_ID_DMA_6_0 = 25, /* internal */ + GAUDI_QUEUE_ID_DMA_6_1 = 26, /* internal */ + GAUDI_QUEUE_ID_DMA_6_2 = 27, /* internal */ + GAUDI_QUEUE_ID_DMA_6_3 = 28, /* internal */ + GAUDI_QUEUE_ID_DMA_7_0 = 29, /* internal */ + GAUDI_QUEUE_ID_DMA_7_1 = 30, /* internal */ + GAUDI_QUEUE_ID_DMA_7_2 = 31, /* internal */ + GAUDI_QUEUE_ID_DMA_7_3 = 32, /* internal */ + GAUDI_QUEUE_ID_MME_0_0 = 33, /* internal */ + GAUDI_QUEUE_ID_MME_0_1 = 34, /* internal */ + GAUDI_QUEUE_ID_MME_0_2 = 35, /* internal */ + GAUDI_QUEUE_ID_MME_0_3 = 36, /* internal */ + GAUDI_QUEUE_ID_MME_1_0 = 37, /* internal */ + GAUDI_QUEUE_ID_MME_1_1 = 38, /* internal */ + GAUDI_QUEUE_ID_MME_1_2 = 39, /* internal */ + GAUDI_QUEUE_ID_MME_1_3 = 40, /* internal */ + GAUDI_QUEUE_ID_TPC_0_0 = 41, /* internal */ + GAUDI_QUEUE_ID_TPC_0_1 = 42, /* internal */ + GAUDI_QUEUE_ID_TPC_0_2 = 43, /* internal */ + GAUDI_QUEUE_ID_TPC_0_3 = 44, /* internal */ + GAUDI_QUEUE_ID_TPC_1_0 = 45, /* internal */ + GAUDI_QUEUE_ID_TPC_1_1 = 46, /* internal */ + GAUDI_QUEUE_ID_TPC_1_2 = 47, /* internal */ + GAUDI_QUEUE_ID_TPC_1_3 = 48, /* internal */ + GAUDI_QUEUE_ID_TPC_2_0 = 49, /* internal */ + GAUDI_QUEUE_ID_TPC_2_1 = 50, /* internal */ + GAUDI_QUEUE_ID_TPC_2_2 = 51, /* internal */ + GAUDI_QUEUE_ID_TPC_2_3 = 52, /* internal */ + GAUDI_QUEUE_ID_TPC_3_0 = 53, /* internal */ + GAUDI_QUEUE_ID_TPC_3_1 = 54, /* internal */ + GAUDI_QUEUE_ID_TPC_3_2 = 55, /* internal */ + GAUDI_QUEUE_ID_TPC_3_3 = 56, /* internal */ + GAUDI_QUEUE_ID_TPC_4_0 = 57, /* internal */ + GAUDI_QUEUE_ID_TPC_4_1 = 58, /* internal */ + GAUDI_QUEUE_ID_TPC_4_2 = 59, /* internal */ + GAUDI_QUEUE_ID_TPC_4_3 = 60, /* internal */ + GAUDI_QUEUE_ID_TPC_5_0 = 61, /* internal */ + GAUDI_QUEUE_ID_TPC_5_1 = 62, /* internal */ + GAUDI_QUEUE_ID_TPC_5_2 = 63, /* internal */ + GAUDI_QUEUE_ID_TPC_5_3 = 64, /* internal */ + GAUDI_QUEUE_ID_TPC_6_0 = 65, /* internal */ + GAUDI_QUEUE_ID_TPC_6_1 = 66, /* internal */ + GAUDI_QUEUE_ID_TPC_6_2 = 67, /* internal */ + GAUDI_QUEUE_ID_TPC_6_3 = 68, /* internal */ + GAUDI_QUEUE_ID_TPC_7_0 = 69, /* internal */ + GAUDI_QUEUE_ID_TPC_7_1 = 70, /* internal */ + GAUDI_QUEUE_ID_TPC_7_2 = 71, /* internal */ + GAUDI_QUEUE_ID_TPC_7_3 = 72, /* internal */ + GAUDI_QUEUE_ID_NIC_0_0 = 73, /* internal */ + GAUDI_QUEUE_ID_NIC_0_1 = 74, /* internal */ + GAUDI_QUEUE_ID_NIC_0_2 = 75, /* internal */ + GAUDI_QUEUE_ID_NIC_0_3 = 76, /* internal */ + GAUDI_QUEUE_ID_NIC_1_0 = 77, /* internal */ + GAUDI_QUEUE_ID_NIC_1_1 = 78, /* internal */ + GAUDI_QUEUE_ID_NIC_1_2 = 79, /* internal */ + GAUDI_QUEUE_ID_NIC_1_3 = 80, /* internal */ + GAUDI_QUEUE_ID_NIC_2_0 = 81, /* internal */ + GAUDI_QUEUE_ID_NIC_2_1 = 82, /* internal */ + GAUDI_QUEUE_ID_NIC_2_2 = 83, /* internal */ + GAUDI_QUEUE_ID_NIC_2_3 = 84, /* internal */ + GAUDI_QUEUE_ID_NIC_3_0 = 85, /* internal */ + GAUDI_QUEUE_ID_NIC_3_1 = 86, /* internal */ + GAUDI_QUEUE_ID_NIC_3_2 = 87, /* internal */ + GAUDI_QUEUE_ID_NIC_3_3 = 88, /* internal */ + GAUDI_QUEUE_ID_NIC_4_0 = 89, /* internal */ + GAUDI_QUEUE_ID_NIC_4_1 = 90, /* internal */ + GAUDI_QUEUE_ID_NIC_4_2 = 91, /* internal */ + GAUDI_QUEUE_ID_NIC_4_3 = 92, /* internal */ + GAUDI_QUEUE_ID_NIC_5_0 = 93, /* internal */ + GAUDI_QUEUE_ID_NIC_5_1 = 94, /* internal */ + GAUDI_QUEUE_ID_NIC_5_2 = 95, /* internal */ + GAUDI_QUEUE_ID_NIC_5_3 = 96, /* internal */ + GAUDI_QUEUE_ID_NIC_6_0 = 97, /* internal */ + GAUDI_QUEUE_ID_NIC_6_1 = 98, /* internal */ + GAUDI_QUEUE_ID_NIC_6_2 = 99, /* internal */ + GAUDI_QUEUE_ID_NIC_6_3 = 100, /* internal */ + GAUDI_QUEUE_ID_NIC_7_0 = 101, /* internal */ + GAUDI_QUEUE_ID_NIC_7_1 = 102, /* internal */ + GAUDI_QUEUE_ID_NIC_7_2 = 103, /* internal */ + GAUDI_QUEUE_ID_NIC_7_3 = 104, /* internal */ + GAUDI_QUEUE_ID_NIC_8_0 = 105, /* internal */ + GAUDI_QUEUE_ID_NIC_8_1 = 106, /* internal */ + GAUDI_QUEUE_ID_NIC_8_2 = 107, /* internal */ + GAUDI_QUEUE_ID_NIC_8_3 = 108, /* internal */ + GAUDI_QUEUE_ID_NIC_9_0 = 109, /* internal */ + GAUDI_QUEUE_ID_NIC_9_1 = 110, /* internal */ + GAUDI_QUEUE_ID_NIC_9_2 = 111, /* internal */ + GAUDI_QUEUE_ID_NIC_9_3 = 112, /* internal */ + GAUDI_QUEUE_ID_SIZE +}; + /* * Engine Numbering * @@ -69,6 +195,40 @@ enum goya_engine_id { GOYA_ENGINE_ID_SIZE }; +enum gaudi_engine_id { + GAUDI_ENGINE_ID_DMA_0 = 0, + GAUDI_ENGINE_ID_DMA_1, + GAUDI_ENGINE_ID_DMA_2, + GAUDI_ENGINE_ID_DMA_3, + GAUDI_ENGINE_ID_DMA_4, + GAUDI_ENGINE_ID_DMA_5, + GAUDI_ENGINE_ID_DMA_6, + GAUDI_ENGINE_ID_DMA_7, + GAUDI_ENGINE_ID_MME_0, + GAUDI_ENGINE_ID_MME_1, + GAUDI_ENGINE_ID_MME_2, + GAUDI_ENGINE_ID_MME_3, + GAUDI_ENGINE_ID_TPC_0, + GAUDI_ENGINE_ID_TPC_1, + GAUDI_ENGINE_ID_TPC_2, + GAUDI_ENGINE_ID_TPC_3, + GAUDI_ENGINE_ID_TPC_4, + GAUDI_ENGINE_ID_TPC_5, + GAUDI_ENGINE_ID_TPC_6, + GAUDI_ENGINE_ID_TPC_7, + GAUDI_ENGINE_ID_NIC_0, + GAUDI_ENGINE_ID_NIC_1, + GAUDI_ENGINE_ID_NIC_2, + GAUDI_ENGINE_ID_NIC_3, + GAUDI_ENGINE_ID_NIC_4, + GAUDI_ENGINE_ID_NIC_5, + GAUDI_ENGINE_ID_NIC_6, + GAUDI_ENGINE_ID_NIC_7, + GAUDI_ENGINE_ID_NIC_8, + GAUDI_ENGINE_ID_NIC_9, + GAUDI_ENGINE_ID_SIZE +}; + enum hl_device_status { HL_DEVICE_STATUS_OPERATIONAL, HL_DEVICE_STATUS_IN_RESET, -- cgit v1.2.3 From 0858caa419e6cf9d31e734d09d70b34f64443ef6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2020 13:58:35 +0000 Subject: uapi: General notification queue definitions Add UAPI definitions for the general notification queue, including the following pieces: (*) struct watch_notification. This is the metadata header for notification messages. It includes a type and subtype that indicate the source of the message (eg. WATCH_TYPE_MOUNT_NOTIFY) and the kind of the message (eg. NOTIFY_MOUNT_NEW_MOUNT). The header also contains an information field that conveys the following information: - WATCH_INFO_LENGTH. The size of the entry (entries are variable length). - WATCH_INFO_ID. The watch ID specified when the watchpoint was set. - WATCH_INFO_TYPE_INFO. (Sub)type-specific information. - WATCH_INFO_FLAG_*. Flag bits overlain on the type-specific information. For use by the type. All the information in the header can be used in filtering messages at the point of writing into the buffer. (*) struct watch_notification_removal This is an extended watch-removal notification record that includes an 'id' field that can indicate the identifier of the object being removed if available (for instance, a keyring serial number). Signed-off-by: David Howells --- include/uapi/linux/watch_queue.h | 55 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 include/uapi/linux/watch_queue.h (limited to 'include/uapi') diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h new file mode 100644 index 000000000000..5f3d21e8a34b --- /dev/null +++ b/include/uapi/linux/watch_queue.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_WATCH_QUEUE_H +#define _UAPI_LINUX_WATCH_QUEUE_H + +#include + +enum watch_notification_type { + WATCH_TYPE_META = 0, /* Special record */ + WATCH_TYPE__NR = 1 +}; + +enum watch_meta_notification_subtype { + WATCH_META_REMOVAL_NOTIFICATION = 0, /* Watched object was removed */ + WATCH_META_LOSS_NOTIFICATION = 1, /* Data loss occurred */ +}; + +/* + * Notification record header. This is aligned to 64-bits so that subclasses + * can contain __u64 fields. + */ +struct watch_notification { + __u32 type:24; /* enum watch_notification_type */ + __u32 subtype:8; /* Type-specific subtype (filterable) */ + __u32 info; +#define WATCH_INFO_LENGTH 0x0000007f /* Length of record */ +#define WATCH_INFO_LENGTH__SHIFT 0 +#define WATCH_INFO_ID 0x0000ff00 /* ID of watchpoint */ +#define WATCH_INFO_ID__SHIFT 8 +#define WATCH_INFO_TYPE_INFO 0xffff0000 /* Type-specific info */ +#define WATCH_INFO_TYPE_INFO__SHIFT 16 +#define WATCH_INFO_FLAG_0 0x00010000 /* Type-specific info, flag bit 0 */ +#define WATCH_INFO_FLAG_1 0x00020000 /* ... */ +#define WATCH_INFO_FLAG_2 0x00040000 +#define WATCH_INFO_FLAG_3 0x00080000 +#define WATCH_INFO_FLAG_4 0x00100000 +#define WATCH_INFO_FLAG_5 0x00200000 +#define WATCH_INFO_FLAG_6 0x00400000 +#define WATCH_INFO_FLAG_7 0x00800000 +}; + + +/* + * Extended watch removal notification. This is used optionally if the type + * wants to indicate an identifier for the object being watched, if there is + * such. This can be distinguished by the length. + * + * type -> WATCH_TYPE_META + * subtype -> WATCH_META_REMOVAL_NOTIFICATION + */ +struct watch_notification_removal { + struct watch_notification watch; + __u64 id; /* Type-dependent identifier */ +}; + +#endif /* _UAPI_LINUX_WATCH_QUEUE_H */ -- cgit v1.2.3 From b580b93664f91db8cb503429030df0f1c1e53528 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2020 13:58:35 +0000 Subject: pipe: Add O_NOTIFICATION_PIPE Add an O_NOTIFICATION_PIPE flag that can be passed to pipe2() to indicate that the pipe being created is going to be used for notifications. This suppresses the use of splice(), vmsplice(), tee() and sendfile() on the pipe as calling iov_iter_revert() on a pipe when a kernel notification message has been inserted into the middle of a multi-buffer splice will be messy. The flag is given the same value as O_EXCL as it seems unlikely that this flag will ever be applicable to pipes and I don't want to use up another O_* bit unnecessarily. An alternative could be to add a pipe3() system call. Signed-off-by: David Howells --- include/uapi/linux/watch_queue.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h index 5f3d21e8a34b..9df72227f515 100644 --- a/include/uapi/linux/watch_queue.h +++ b/include/uapi/linux/watch_queue.h @@ -3,6 +3,9 @@ #define _UAPI_LINUX_WATCH_QUEUE_H #include +#include + +#define O_NOTIFICATION_PIPE O_EXCL /* Parameter to pipe2() selecting notification pipe */ enum watch_notification_type { WATCH_TYPE_META = 0, /* Special record */ -- cgit v1.2.3 From c73be61cede5882f9605a852414db559c0ebedfd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: pipe: Add general notification queue support Make it possible to have a general notification queue built on top of a standard pipe. Notifications are 'spliced' into the pipe and then read out. splice(), vmsplice() and sendfile() are forbidden on pipes used for notifications as post_one_notification() cannot take pipe->mutex. This means that notifications could be posted in between individual pipe buffers, making iov_iter_revert() difficult to effect. The way the notification queue is used is: (1) An application opens a pipe with a special flag and indicates the number of messages it wishes to be able to queue at once (this can only be set once): pipe2(fds, O_NOTIFICATION_PIPE); ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, queue_depth); (2) The application then uses poll() and read() as normal to extract data from the pipe. read() will return multiple notifications if the buffer is big enough, but it will not split a notification across buffers - rather it will return a short read or EMSGSIZE. Notification messages include a length in the header so that the caller can split them up. Each message has a header that describes it: struct watch_notification { __u32 type:24; __u32 subtype:8; __u32 info; }; The type indicates the source (eg. mount tree changes, superblock events, keyring changes, block layer events) and the subtype indicates the event type (eg. mount, unmount; EIO, EDQUOT; link, unlink). The info field indicates a number of things, including the entry length, an ID assigned to a watchpoint contributing to this buffer and type-specific flags. Supplementary data, such as the key ID that generated an event, can be attached in additional slots. The maximum message size is 127 bytes. Messages may not be padded or aligned, so there is no guarantee, for example, that the notification type will be on a 4-byte bounary. Signed-off-by: David Howells --- Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + Documentation/watch_queue.rst | 339 +++++++++++ fs/pipe.c | 206 ++++--- fs/splice.c | 12 +- include/linux/pipe_fs_i.h | 19 +- include/linux/watch_queue.h | 127 ++++ include/uapi/linux/watch_queue.h | 20 + init/Kconfig | 12 + kernel/Makefile | 1 + kernel/watch_queue.c | 657 +++++++++++++++++++++ 10 files changed, 1318 insertions(+), 76 deletions(-) create mode 100644 Documentation/watch_queue.rst create mode 100644 include/linux/watch_queue.h create mode 100644 kernel/watch_queue.c (limited to 'include/uapi') diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index f759edafd938..9425377615ce 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -201,6 +201,7 @@ Code Seq# Include File Comments 'W' 00-1F linux/wanrouter.h conflict! (pre 3.9) 'W' 00-3F sound/asound.h conflict! 'W' 40-5F drivers/pci/switch/switchtec.c +'W' 60-61 linux/watch_queue.h 'X' all fs/xfs/xfs_fs.h, conflict! fs/xfs/linux-2.6/xfs_ioctl32.h, include/linux/falloc.h, diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst new file mode 100644 index 000000000000..849fad6893ef --- /dev/null +++ b/Documentation/watch_queue.rst @@ -0,0 +1,339 @@ +============================== +General notification mechanism +============================== + +The general notification mechanism is built on top of the standard pipe driver +whereby it effectively splices notification messages from the kernel into pipes +opened by userspace. This can be used in conjunction with:: + + * Key/keyring notifications + + +The notifications buffers can be enabled by: + + "General setup"/"General notification queue" + (CONFIG_WATCH_QUEUE) + +This document has the following sections: + +.. contents:: :local: + + +Overview +======== + +This facility appears as a pipe that is opened in a special mode. The pipe's +internal ring buffer is used to hold messages that are generated by the kernel. +These messages are then read out by read(). Splice and similar are disabled on +such pipes due to them wanting to, under some circumstances, revert their +additions to the ring - which might end up interleaved with notification +messages. + +The owner of the pipe has to tell the kernel which sources it would like to +watch through that pipe. Only sources that have been connected to a pipe will +insert messages into it. Note that a source may be bound to multiple pipes and +insert messages into all of them simultaneously. + +Filters may also be emplaced on a pipe so that certain source types and +subevents can be ignored if they're not of interest. + +A message will be discarded if there isn't a slot available in the ring or if +no preallocated message buffer is available. In both of these cases, read() +will insert a WATCH_META_LOSS_NOTIFICATION message into the output buffer after +the last message currently in the buffer has been read. + +Note that when producing a notification, the kernel does not wait for the +consumers to collect it, but rather just continues on. This means that +notifications can be generated whilst spinlocks are held and also protects the +kernel from being held up indefinitely by a userspace malfunction. + + +Message Structure +================= + +Notification messages begin with a short header:: + + struct watch_notification { + __u32 type:24; + __u32 subtype:8; + __u32 info; + }; + +"type" indicates the source of the notification record and "subtype" indicates +the type of record from that source (see the Watch Sources section below). The +type may also be "WATCH_TYPE_META". This is a special record type generated +internally by the watch queue itself. There are two subtypes: + + * WATCH_META_REMOVAL_NOTIFICATION + * WATCH_META_LOSS_NOTIFICATION + +The first indicates that an object on which a watch was installed was removed +or destroyed and the second indicates that some messages have been lost. + +"info" indicates a bunch of things, including: + + * The length of the message in bytes, including the header (mask with + WATCH_INFO_LENGTH and shift by WATCH_INFO_LENGTH__SHIFT). This indicates + the size of the record, which may be between 8 and 127 bytes. + + * The watch ID (mask with WATCH_INFO_ID and shift by WATCH_INFO_ID__SHIFT). + This indicates that caller's ID of the watch, which may be between 0 + and 255. Multiple watches may share a queue, and this provides a means to + distinguish them. + + * A type-specific field (WATCH_INFO_TYPE_INFO). This is set by the + notification producer to indicate some meaning specific to the type and + subtype. + +Everything in info apart from the length can be used for filtering. + +The header can be followed by supplementary information. The format of this is +at the discretion is defined by the type and subtype. + + +Watch List (Notification Source) API +==================================== + +A "watch list" is a list of watchers that are subscribed to a source of +notifications. A list may be attached to an object (say a key or a superblock) +or may be global (say for device events). From a userspace perspective, a +non-global watch list is typically referred to by reference to the object it +belongs to (such as using KEYCTL_NOTIFY and giving it a key serial number to +watch that specific key). + +To manage a watch list, the following functions are provided: + + * ``void init_watch_list(struct watch_list *wlist, + void (*release_watch)(struct watch *wlist));`` + + Initialise a watch list. If ``release_watch`` is not NULL, then this + indicates a function that should be called when the watch_list object is + destroyed to discard any references the watch list holds on the watched + object. + + * ``void remove_watch_list(struct watch_list *wlist);`` + + This removes all of the watches subscribed to a watch_list and frees them + and then destroys the watch_list object itself. + + +Watch Queue (Notification Output) API +===================================== + +A "watch queue" is the buffer allocated by an application that notification +records will be written into. The workings of this are hidden entirely inside +of the pipe device driver, but it is necessary to gain a reference to it to set +a watch. These can be managed with: + + * ``struct watch_queue *get_watch_queue(int fd);`` + + Since watch queues are indicated to the kernel by the fd of the pipe that + implements the buffer, userspace must hand that fd through a system call. + This can be used to look up an opaque pointer to the watch queue from the + system call. + + * ``void put_watch_queue(struct watch_queue *wqueue);`` + + This discards the reference obtained from ``get_watch_queue()``. + + +Watch Subscription API +====================== + +A "watch" is a subscription on a watch list, indicating the watch queue, and +thus the buffer, into which notification records should be written. The watch +queue object may also carry filtering rules for that object, as set by +userspace. Some parts of the watch struct can be set by the driver:: + + struct watch { + union { + u32 info_id; /* ID to be OR'd in to info field */ + ... + }; + void *private; /* Private data for the watched object */ + u64 id; /* Internal identifier */ + ... + }; + +The ``info_id`` value should be an 8-bit number obtained from userspace and +shifted by WATCH_INFO_ID__SHIFT. This is OR'd into the WATCH_INFO_ID field of +struct watch_notification::info when and if the notification is written into +the associated watch queue buffer. + +The ``private`` field is the driver's data associated with the watch_list and +is cleaned up by the ``watch_list::release_watch()`` method. + +The ``id`` field is the source's ID. Notifications that are posted with a +different ID are ignored. + +The following functions are provided to manage watches: + + * ``void init_watch(struct watch *watch, struct watch_queue *wqueue);`` + + Initialise a watch object, setting its pointer to the watch queue, using + appropriate barriering to avoid lockdep complaints. + + * ``int add_watch_to_object(struct watch *watch, struct watch_list *wlist);`` + + Subscribe a watch to a watch list (notification source). The + driver-settable fields in the watch struct must have been set before this + is called. + + * ``int remove_watch_from_object(struct watch_list *wlist, + struct watch_queue *wqueue, + u64 id, false);`` + + Remove a watch from a watch list, where the watch must match the specified + watch queue (``wqueue``) and object identifier (``id``). A notification + (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue to + indicate that the watch got removed. + + * ``int remove_watch_from_object(struct watch_list *wlist, NULL, 0, true);`` + + Remove all the watches from a watch list. It is expected that this will be + called preparatory to destruction and that the watch list will be + inaccessible to new watches by this point. A notification + (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue of each + subscribed watch to indicate that the watch got removed. + + +Notification Posting API +======================== + +To post a notification to watch list so that the subscribed watches can see it, +the following function should be used:: + + void post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id); + +The notification should be preformatted and a pointer to the header (``n``) +should be passed in. The notification may be larger than this and the size in +units of buffer slots is noted in ``n->info & WATCH_INFO_LENGTH``. + +The ``cred`` struct indicates the credentials of the source (subject) and is +passed to the LSMs, such as SELinux, to allow or suppress the recording of the +note in each individual queue according to the credentials of that queue +(object). + +The ``id`` is the ID of the source object (such as the serial number on a key). +Only watches that have the same ID set in them will see this notification. + + +Watch Sources +============= + +Any particular buffer can be fed from multiple sources. Sources include: + + * WATCH_TYPE_KEY_NOTIFY + + Notifications of this type indicate changes to keys and keyrings, including + the changes of keyring contents or the attributes of keys. + + See Documentation/security/keys/core.rst for more information. + + +Event Filtering +=============== + +Once a watch queue has been created, a set of filters can be applied to limit +the events that are received using:: + + struct watch_notification_filter filter = { + ... + }; + ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) + +The filter description is a variable of type:: + + struct watch_notification_filter { + __u32 nr_filters; + __u32 __reserved; + struct watch_notification_type_filter filters[]; + }; + +Where "nr_filters" is the number of filters in filters[] and "__reserved" +should be 0. The "filters" array has elements of the following type:: + + struct watch_notification_type_filter { + __u32 type; + __u32 info_filter; + __u32 info_mask; + __u32 subtype_filter[8]; + }; + +Where: + + * ``type`` is the event type to filter for and should be something like + "WATCH_TYPE_KEY_NOTIFY" + + * ``info_filter`` and ``info_mask`` act as a filter on the info field of the + notification record. The notification is only written into the buffer if:: + + (watch.info & info_mask) == info_filter + + This could be used, for example, to ignore events that are not exactly on + the watched point in a mount tree. + + * ``subtype_filter`` is a bitmask indicating the subtypes that are of + interest. Bit 0 of subtype_filter[0] corresponds to subtype 0, bit 1 to + subtype 1, and so on. + +If the argument to the ioctl() is NULL, then the filters will be removed and +all events from the watched sources will come through. + + +Userspace Code Example +====================== + +A buffer is created with something like the following:: + + pipe2(fds, O_TMPFILE); + ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256); + +It can then be set to receive keyring change notifications:: + + keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, fds[1], 0x01); + +The notifications can then be consumed by something like the following:: + + static void consumer(int rfd, struct watch_queue_buffer *buf) + { + unsigned char buffer[128]; + ssize_t buf_len; + + while (buf_len = read(rfd, buffer, sizeof(buffer)), + buf_len > 0 + ) { + void *p = buffer; + void *end = buffer + buf_len; + while (p < end) { + union { + struct watch_notification n; + unsigned char buf1[128]; + } n; + size_t largest, len; + + largest = end - p; + if (largest > 128) + largest = 128; + memcpy(&n, p, largest); + + len = (n->info & WATCH_INFO_LENGTH) >> + WATCH_INFO_LENGTH__SHIFT; + if (len == 0 || len > largest) + return; + + switch (n.n.type) { + case WATCH_TYPE_META: + got_meta(&n.n); + case WATCH_TYPE_KEY_NOTIFY: + saw_key_change(&n.n); + break; + } + + p += len; + } + } + } diff --git a/fs/pipe.c b/fs/pipe.c index 16fb72e9abf7..da9bc1f21fd1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -459,6 +460,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) { + ret = -EXDEV; + goto out; + } +#endif + /* * Only wake up if the pipe started out empty, since * otherwise there should be no readers waiting. @@ -628,22 +636,37 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int count, head, tail, mask; switch (cmd) { - case FIONREAD: - __pipe_lock(pipe); - count = 0; - head = pipe->head; - tail = pipe->tail; - mask = pipe->ring_size - 1; + case FIONREAD: + __pipe_lock(pipe); + count = 0; + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; - while (tail != head) { - count += pipe->bufs[tail & mask].len; - tail++; - } - __pipe_unlock(pipe); + while (tail != head) { + count += pipe->bufs[tail & mask].len; + tail++; + } + __pipe_unlock(pipe); - return put_user(count, (int __user *)arg); - default: - return -ENOIOCTLCMD; + return put_user(count, (int __user *)arg); + +#ifdef CONFIG_WATCH_QUEUE + case IOC_WATCH_QUEUE_SET_SIZE: { + int ret; + __pipe_lock(pipe); + ret = watch_queue_set_size(pipe, arg); + __pipe_unlock(pipe); + return ret; + } + + case IOC_WATCH_QUEUE_SET_FILTER: + return watch_queue_set_filter( + pipe, (struct watch_notification_filter __user *)arg); +#endif + + default: + return -ENOIOCTLCMD; } } @@ -754,27 +777,27 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } -static unsigned long account_pipe_buffers(struct user_struct *user, - unsigned long old, unsigned long new) +unsigned long account_pipe_buffers(struct user_struct *user, + unsigned long old, unsigned long new) { return atomic_long_add_return(new - old, &user->pipe_bufs); } -static bool too_many_pipe_buffers_soft(unsigned long user_bufs) +bool too_many_pipe_buffers_soft(unsigned long user_bufs) { unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); return soft_limit && user_bufs > soft_limit; } -static bool too_many_pipe_buffers_hard(unsigned long user_bufs) +bool too_many_pipe_buffers_hard(unsigned long user_bufs) { unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); return hard_limit && user_bufs > hard_limit; } -static bool is_unprivileged_user(void) +bool pipe_is_unprivileged_user(void) { return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } @@ -796,12 +819,12 @@ struct pipe_inode_info *alloc_pipe_info(void) user_bufs = account_pipe_buffers(user, 0, pipe_bufs); - if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { + if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, 1); pipe_bufs = 1; } - if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) + if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), @@ -813,6 +836,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->r_counter = pipe->w_counter = 1; pipe->max_usage = pipe_bufs; pipe->ring_size = pipe_bufs; + pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; @@ -830,7 +854,14 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0); +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) { + watch_queue_clear(pipe->watch_queue); + put_watch_queue(pipe->watch_queue); + } +#endif + + (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); free_uid(pipe->user); for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = pipe->bufs + i; @@ -906,6 +937,17 @@ int create_pipe_files(struct file **res, int flags) if (!inode) return -ENFILE; + if (flags & O_NOTIFICATION_PIPE) { +#ifdef CONFIG_WATCH_QUEUE + if (watch_queue_init(inode->i_pipe) < 0) { + iput(inode); + return -ENOMEM; + } +#else + return -ENOPKG; +#endif + } + f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), &pipefifo_fops); @@ -936,7 +978,7 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags) int error; int fdw, fdr; - if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) + if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) return -EINVAL; error = create_pipe_files(files, flags); @@ -1184,42 +1226,12 @@ unsigned int round_pipe_size(unsigned long size) } /* - * Allocate a new array of pipe buffers and copy the info over. Returns the - * pipe size if successful, or return -ERROR on error. + * Resize the pipe ring to a number of slots. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) { struct pipe_buffer *bufs; - unsigned int size, nr_slots, head, tail, mask, n; - unsigned long user_bufs; - long ret = 0; - - size = round_pipe_size(arg); - nr_slots = size >> PAGE_SHIFT; - - if (!nr_slots) - return -EINVAL; - - /* - * If trying to increase the pipe capacity, check that an - * unprivileged user is not trying to exceed various limits - * (soft limit check here, hard limit check just below). - * Decreasing the pipe capacity is always permitted, even - * if the user is currently over a limit. - */ - if (nr_slots > pipe->ring_size && - size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots); - - if (nr_slots > pipe->ring_size && - (too_many_pipe_buffers_hard(user_bufs) || - too_many_pipe_buffers_soft(user_bufs)) && - is_unprivileged_user()) { - ret = -EPERM; - goto out_revert_acct; - } + unsigned int head, tail, mask, n; /* * We can shrink the pipe, if arg is greater than the ring occupancy. @@ -1231,17 +1243,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) head = pipe->head; tail = pipe->tail; n = pipe_occupancy(pipe->head, pipe->tail); - if (nr_slots < n) { - ret = -EBUSY; - goto out_revert_acct; - } + if (nr_slots < n) + return -EBUSY; bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (unlikely(!bufs)) { - ret = -ENOMEM; - goto out_revert_acct; - } + if (unlikely(!bufs)) + return -ENOMEM; /* * The pipe array wraps around, so just start the new one at zero @@ -1269,16 +1277,68 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) kfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; - pipe->max_usage = nr_slots; + if (pipe->max_usage > nr_slots) + pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; /* This might have made more room for writers */ wake_up_interruptible(&pipe->wr_wait); + return 0; +} + +/* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + unsigned long user_bufs; + unsigned int nr_slots, size; + long ret = 0; + +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) + return -EBUSY; +#endif + + size = round_pipe_size(arg); + nr_slots = size >> PAGE_SHIFT; + + if (!nr_slots) + return -EINVAL; + + /* + * If trying to increase the pipe capacity, check that an + * unprivileged user is not trying to exceed various limits + * (soft limit check here, hard limit check just below). + * Decreasing the pipe capacity is always permitted, even + * if the user is currently over a limit. + */ + if (nr_slots > pipe->max_usage && + size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots); + + if (nr_slots > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto out_revert_acct; + } + + ret = pipe_resize_ring(pipe, nr_slots); + if (ret < 0) + goto out_revert_acct; + + pipe->max_usage = nr_slots; + pipe->nr_accounted = nr_slots; return pipe->max_usage * PAGE_SIZE; out_revert_acct: - (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size); + (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted); return ret; } @@ -1287,9 +1347,17 @@ out_revert_acct: * location, so checking ->i_pipe is not enough to verify that this is a * pipe. */ -struct pipe_inode_info *get_pipe_info(struct file *file) +struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { - return file->f_op == &pipefifo_fops ? file->private_data : NULL; + struct pipe_inode_info *pipe = file->private_data; + + if (file->f_op != &pipefifo_fops || !pipe) + return NULL; +#ifdef CONFIG_WATCH_QUEUE + if (for_splice && pipe->watch_queue) + return NULL; +#endif + return pipe; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1297,7 +1365,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) struct pipe_inode_info *pipe; long ret; - pipe = get_pipe_info(file); + pipe = get_pipe_info(file, false); if (!pipe) return -EBADF; diff --git a/fs/splice.c b/fs/splice.c index fd0a1e7e5959..50f3c0260c00 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1122,8 +1122,8 @@ long do_splice(struct file *in, loff_t __user *off_in, !(out->f_mode & FMODE_WRITE))) return -EBADF; - ipipe = get_pipe_info(in); - opipe = get_pipe_info(out); + ipipe = get_pipe_info(in, true); + opipe = get_pipe_info(out, true); if (ipipe && opipe) { if (off_in || off_out) @@ -1273,7 +1273,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, static long vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) { - struct pipe_inode_info *pipe = get_pipe_info(file); + struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { .total_len = iov_iter_count(iter), .flags = flags, @@ -1308,7 +1308,7 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (flags & SPLICE_F_GIFT) buf_flag = PIPE_BUF_FLAG_GIFT; - pipe = get_pipe_info(file); + pipe = get_pipe_info(file, true); if (!pipe) return -EBADF; @@ -1757,8 +1757,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = get_pipe_info(in); - struct pipe_inode_info *opipe = get_pipe_info(out); + struct pipe_inode_info *ipipe = get_pipe_info(in, true); + struct pipe_inode_info *opipe = get_pipe_info(out, true); int ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index ae58fad7f1e0..1d3eaa233f4a 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -35,6 +35,7 @@ struct pipe_buffer { * @tail: The point of buffer consumption * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) + * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs * @tmp_page: cached released page * @readers: number of current readers of this pipe * @writers: number of current writers of this pipe @@ -45,6 +46,7 @@ struct pipe_buffer { * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers * @user: the user who created this pipe + * @watch_queue: If this pipe is a watch_queue, this is the stuff for that **/ struct pipe_inode_info { struct mutex mutex; @@ -53,6 +55,7 @@ struct pipe_inode_info { unsigned int tail; unsigned int max_usage; unsigned int ring_size; + unsigned int nr_accounted; unsigned int readers; unsigned int writers; unsigned int files; @@ -63,6 +66,9 @@ struct pipe_inode_info { struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; struct user_struct *user; +#ifdef CONFIG_WATCH_QUEUE + struct watch_queue *watch_queue; +#endif }; /* @@ -237,9 +243,20 @@ void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; +#ifdef CONFIG_WATCH_QUEUE +unsigned long account_pipe_buffers(struct user_struct *user, + unsigned long old, unsigned long new); +bool too_many_pipe_buffers_soft(unsigned long user_bufs); +bool too_many_pipe_buffers_hard(unsigned long user_bufs); +bool pipe_is_unprivileged_user(void); +#endif + /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ +#ifdef CONFIG_WATCH_QUEUE +int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); +#endif long pipe_fcntl(struct file *, unsigned int, unsigned long arg); -struct pipe_inode_info *get_pipe_info(struct file *file); +struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); int create_pipe_files(struct file **, int); unsigned int round_pipe_size(unsigned long size); diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h new file mode 100644 index 000000000000..5e08db2adc31 --- /dev/null +++ b/include/linux/watch_queue.h @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* User-mappable watch queue + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#ifndef _LINUX_WATCH_QUEUE_H +#define _LINUX_WATCH_QUEUE_H + +#include +#include +#include + +#ifdef CONFIG_WATCH_QUEUE + +struct cred; + +struct watch_type_filter { + enum watch_notification_type type; + __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */ + __u32 info_filter; /* Filter on watch_notification::info */ + __u32 info_mask; /* Mask of relevant bits in info_filter */ +}; + +struct watch_filter { + union { + struct rcu_head rcu; + unsigned long type_filter[2]; /* Bitmask of accepted types */ + }; + u32 nr_filters; /* Number of filters */ + struct watch_type_filter filters[]; +}; + +struct watch_queue { + struct rcu_head rcu; + struct watch_filter __rcu *filter; + struct pipe_inode_info *pipe; /* The pipe we're using as a buffer */ + struct hlist_head watches; /* Contributory watches */ + struct page **notes; /* Preallocated notifications */ + unsigned long *notes_bitmap; /* Allocation bitmap for notes */ + struct kref usage; /* Object usage count */ + spinlock_t lock; + unsigned int nr_notes; /* Number of notes */ + unsigned int nr_pages; /* Number of pages in notes[] */ + bool defunct; /* T when queues closed */ +}; + +/* + * Representation of a watch on an object. + */ +struct watch { + union { + struct rcu_head rcu; + u32 info_id; /* ID to be OR'd in to info field */ + }; + struct watch_queue __rcu *queue; /* Queue to post events to */ + struct hlist_node queue_node; /* Link in queue->watches */ + struct watch_list __rcu *watch_list; + struct hlist_node list_node; /* Link in watch_list->watchers */ + const struct cred *cred; /* Creds of the owner of the watch */ + void *private; /* Private data for the watched object */ + u64 id; /* Internal identifier */ + struct kref usage; /* Object usage count */ +}; + +/* + * List of watches on an object. + */ +struct watch_list { + struct rcu_head rcu; + struct hlist_head watchers; + void (*release_watch)(struct watch *); + spinlock_t lock; +}; + +extern void __post_watch_notification(struct watch_list *, + struct watch_notification *, + const struct cred *, + u64); +extern struct watch_queue *get_watch_queue(int); +extern void put_watch_queue(struct watch_queue *); +extern void init_watch(struct watch *, struct watch_queue *); +extern int add_watch_to_object(struct watch *, struct watch_list *); +extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool); +extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int); +extern long watch_queue_set_filter(struct pipe_inode_info *, + struct watch_notification_filter __user *); +extern int watch_queue_init(struct pipe_inode_info *); +extern void watch_queue_clear(struct watch_queue *); + +static inline void init_watch_list(struct watch_list *wlist, + void (*release_watch)(struct watch *)) +{ + INIT_HLIST_HEAD(&wlist->watchers); + spin_lock_init(&wlist->lock); + wlist->release_watch = release_watch; +} + +static inline void post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + if (unlikely(wlist)) + __post_watch_notification(wlist, n, cred, id); +} + +static inline void remove_watch_list(struct watch_list *wlist, u64 id) +{ + if (wlist) { + remove_watch_from_object(wlist, NULL, id, true); + kfree_rcu(wlist, rcu); + } +} + +/** + * watch_sizeof - Calculate the information part of the size of a watch record, + * given the structure size. + */ +#define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT) + +#endif + +#endif /* _LINUX_WATCH_QUEUE_H */ diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h index 9df72227f515..3a5790f1f05d 100644 --- a/include/uapi/linux/watch_queue.h +++ b/include/uapi/linux/watch_queue.h @@ -4,9 +4,13 @@ #include #include +#include #define O_NOTIFICATION_PIPE O_EXCL /* Parameter to pipe2() selecting notification pipe */ +#define IOC_WATCH_QUEUE_SET_SIZE _IO('W', 0x60) /* Set the size in pages */ +#define IOC_WATCH_QUEUE_SET_FILTER _IO('W', 0x61) /* Set the filter */ + enum watch_notification_type { WATCH_TYPE_META = 0, /* Special record */ WATCH_TYPE__NR = 1 @@ -41,6 +45,22 @@ struct watch_notification { #define WATCH_INFO_FLAG_7 0x00800000 }; +/* + * Notification filtering rules (IOC_WATCH_QUEUE_SET_FILTER). + */ +struct watch_notification_type_filter { + __u32 type; /* Type to apply filter to */ + __u32 info_filter; /* Filter on watch_notification::info */ + __u32 info_mask; /* Mask of relevant bits in info_filter */ + __u32 subtype_filter[8]; /* Bitmask of subtypes to filter on */ +}; + +struct watch_notification_filter { + __u32 nr_filters; /* Number of filters */ + __u32 __reserved; /* Must be 0 */ + struct watch_notification_type_filter filters[]; +}; + /* * Extended watch removal notification. This is used optionally if the type diff --git a/init/Kconfig b/init/Kconfig index 74a5ac65644f..c95a2a5654a9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -326,6 +326,18 @@ config POSIX_MQUEUE_SYSCTL depends on SYSCTL default y +config WATCH_QUEUE + bool "General notification queue" + default n + help + + This is a general notification queue for the kernel to pass events to + userspace by splicing them into pipes. It can be used in conjunction + with watches for key/keyring change notifications and device + notifications. + + See Documentation/watch_queue.rst + config CROSS_MEMORY_ATTACH bool "Enable process_vm_readv/writev syscalls" depends on MMU diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..41e7e3ae07ec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,6 +115,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c new file mode 100644 index 000000000000..c103e34f8705 --- /dev/null +++ b/kernel/watch_queue.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Watch queue and general notification mechanism, built on pipes + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * See Documentation/watch_queue.rst + */ + +#define pr_fmt(fmt) "watchq: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_DESCRIPTION("Watch queue"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +#define WATCH_QUEUE_NOTE_SIZE 128 +#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) + +static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct watch_queue *wqueue = (struct watch_queue *)buf->private; + struct page *page; + unsigned int bit; + + /* We need to work out which note within the page this refers to, but + * the note might have been maximum size, so merely ANDing the offset + * off doesn't work. OTOH, the note must've been more than zero size. + */ + bit = buf->offset + buf->len; + if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) + bit -= WATCH_QUEUE_NOTE_SIZE; + bit /= WATCH_QUEUE_NOTE_SIZE; + + page = buf->page; + bit += page->index; + + set_bit(bit, wqueue->notes_bitmap); +} + +static int watch_queue_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return -1; /* No. */ +} + +/* New data written to a pipe may be appended to a buffer with this type. */ +static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { + .confirm = generic_pipe_buf_confirm, + .release = watch_queue_pipe_buf_release, + .steal = watch_queue_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +/* + * Post a notification to a watch queue. + */ +static bool post_one_notification(struct watch_queue *wqueue, + struct watch_notification *n) +{ + void *p; + struct pipe_inode_info *pipe = wqueue->pipe; + struct pipe_buffer *buf; + struct page *page; + unsigned int head, tail, mask, note, offset, len; + bool done = false; + + if (!pipe) + return false; + + spin_lock_irq(&pipe->rd_wait.lock); + + if (wqueue->defunct) + goto out; + + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + if (pipe_full(head, tail, pipe->ring_size)) + goto lost; + + note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); + if (note >= wqueue->nr_notes) + goto lost; + + page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; + offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; + get_page(page); + len = n->info & WATCH_INFO_LENGTH; + p = kmap_atomic(page); + memcpy(p + offset, n, len); + kunmap_atomic(p); + + buf = &pipe->bufs[head & mask]; + buf->page = page; + buf->private = (unsigned long)wqueue; + buf->ops = &watch_queue_pipe_buf_ops; + buf->offset = offset; + buf->len = len; + buf->flags = 0; + pipe->head = head + 1; + + if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { + spin_unlock_irq(&pipe->rd_wait.lock); + BUG(); + } + wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + done = true; + +out: + spin_unlock_irq(&pipe->rd_wait.lock); + if (done) + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + return done; + +lost: + goto out; +} + +/* + * Apply filter rules to a notification. + */ +static bool filter_watch_notification(const struct watch_filter *wf, + const struct watch_notification *n) +{ + const struct watch_type_filter *wt; + unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; + unsigned int st_index = n->subtype / st_bits; + unsigned int st_bit = 1U << (n->subtype % st_bits); + int i; + + if (!test_bit(n->type, wf->type_filter)) + return false; + + for (i = 0; i < wf->nr_filters; i++) { + wt = &wf->filters[i]; + if (n->type == wt->type && + (wt->subtype_filter[st_index] & st_bit) && + (n->info & wt->info_mask) == wt->info_filter) + return true; + } + + return false; /* If there is a filter, the default is to reject. */ +} + +/** + * __post_watch_notification - Post an event notification + * @wlist: The watch list to post the event to. + * @n: The notification record to post. + * @cred: The creds of the process that triggered the notification. + * @id: The ID to match on the watch. + * + * Post a notification of an event into a set of watch queues and let the users + * know. + * + * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and + * should be in units of sizeof(*n). + */ +void __post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + u64 id) +{ + const struct watch_filter *wf; + struct watch_queue *wqueue; + struct watch *watch; + + if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { + WARN_ON(1); + return; + } + + rcu_read_lock(); + + hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { + if (watch->id != id) + continue; + n->info &= ~WATCH_INFO_ID; + n->info |= watch->info_id; + + wqueue = rcu_dereference(watch->queue); + wf = rcu_dereference(wqueue->filter); + if (wf && !filter_watch_notification(wf, n)) + continue; + + if (security_post_notification(watch->cred, cred, n) < 0) + continue; + + post_one_notification(wqueue, n); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL(__post_watch_notification); + +/* + * Allocate sufficient pages to preallocation for the requested number of + * notifications. + */ +long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) +{ + struct watch_queue *wqueue = pipe->watch_queue; + struct page **pages; + unsigned long *bitmap; + unsigned long user_bufs; + unsigned int bmsize; + int ret, i, nr_pages; + + if (!wqueue) + return -ENODEV; + if (wqueue->notes) + return -EBUSY; + + if (nr_notes < 1 || + nr_notes > 512) /* TODO: choose a better hard limit */ + return -EINVAL; + + nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); + nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; + user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); + + if (nr_pages > pipe->max_usage && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + pipe_is_unprivileged_user()) { + ret = -EPERM; + goto error; + } + + ret = pipe_resize_ring(pipe, nr_notes); + if (ret < 0) + goto error; + + pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + if (!pages) + goto error; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto error_p; + pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + } + + bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; + bmsize *= sizeof(unsigned long); + bitmap = kmalloc(bmsize, GFP_KERNEL); + if (!bitmap) + goto error_p; + + memset(bitmap, 0xff, bmsize); + wqueue->notes = pages; + wqueue->notes_bitmap = bitmap; + wqueue->nr_pages = nr_pages; + wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + return 0; + +error_p: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kfree(pages); +error: + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); + return ret; +} + +/* + * Set the filter on a watch queue. + */ +long watch_queue_set_filter(struct pipe_inode_info *pipe, + struct watch_notification_filter __user *_filter) +{ + struct watch_notification_type_filter *tf; + struct watch_notification_filter filter; + struct watch_type_filter *q; + struct watch_filter *wfilter; + struct watch_queue *wqueue = pipe->watch_queue; + int ret, nr_filter = 0, i; + + if (!wqueue) + return -ENODEV; + + if (!_filter) { + /* Remove the old filter */ + wfilter = NULL; + goto set; + } + + /* Grab the user's filter specification */ + if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) + return -EFAULT; + if (filter.nr_filters == 0 || + filter.nr_filters > 16 || + filter.__reserved != 0) + return -EINVAL; + + tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); + if (IS_ERR(tf)) + return PTR_ERR(tf); + + ret = -EINVAL; + for (i = 0; i < filter.nr_filters; i++) { + if ((tf[i].info_filter & ~tf[i].info_mask) || + tf[i].info_mask & WATCH_INFO_LENGTH) + goto err_filter; + /* Ignore any unknown types */ + if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + continue; + nr_filter++; + } + + /* Now we need to build the internal filter from only the relevant + * user-specified filters. + */ + ret = -ENOMEM; + wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); + if (!wfilter) + goto err_filter; + wfilter->nr_filters = nr_filter; + + q = wfilter->filters; + for (i = 0; i < filter.nr_filters; i++) { + if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + continue; + + q->type = tf[i].type; + q->info_filter = tf[i].info_filter; + q->info_mask = tf[i].info_mask; + q->subtype_filter[0] = tf[i].subtype_filter[0]; + __set_bit(q->type, wfilter->type_filter); + q++; + } + + kfree(tf); +set: + pipe_lock(pipe); + wfilter = rcu_replace_pointer(wqueue->filter, wfilter, + lockdep_is_held(&pipe->mutex)); + pipe_unlock(pipe); + if (wfilter) + kfree_rcu(wfilter, rcu); + return 0; + +err_filter: + kfree(tf); + return ret; +} + +static void __put_watch_queue(struct kref *kref) +{ + struct watch_queue *wqueue = + container_of(kref, struct watch_queue, usage); + struct watch_filter *wfilter; + int i; + + for (i = 0; i < wqueue->nr_pages; i++) + __free_page(wqueue->notes[i]); + + wfilter = rcu_access_pointer(wqueue->filter); + if (wfilter) + kfree_rcu(wfilter, rcu); + kfree_rcu(wqueue, rcu); +} + +/** + * put_watch_queue - Dispose of a ref on a watchqueue. + * @wqueue: The watch queue to unref. + */ +void put_watch_queue(struct watch_queue *wqueue) +{ + kref_put(&wqueue->usage, __put_watch_queue); +} +EXPORT_SYMBOL(put_watch_queue); + +static void free_watch(struct rcu_head *rcu) +{ + struct watch *watch = container_of(rcu, struct watch, rcu); + + put_watch_queue(rcu_access_pointer(watch->queue)); + put_cred(watch->cred); +} + +static void __put_watch(struct kref *kref) +{ + struct watch *watch = container_of(kref, struct watch, usage); + + call_rcu(&watch->rcu, free_watch); +} + +/* + * Discard a watch. + */ +static void put_watch(struct watch *watch) +{ + kref_put(&watch->usage, __put_watch); +} + +/** + * init_watch_queue - Initialise a watch + * @watch: The watch to initialise. + * @wqueue: The queue to assign. + * + * Initialise a watch and set the watch queue. + */ +void init_watch(struct watch *watch, struct watch_queue *wqueue) +{ + kref_init(&watch->usage); + INIT_HLIST_NODE(&watch->list_node); + INIT_HLIST_NODE(&watch->queue_node); + rcu_assign_pointer(watch->queue, wqueue); +} + +/** + * add_watch_to_object - Add a watch on an object to a watch list + * @watch: The watch to add + * @wlist: The watch list to add to + * + * @watch->queue must have been set to point to the queue to post notifications + * to and the watch list of the object to be watched. @watch->cred must also + * have been set to the appropriate credentials and a ref taken on them. + * + * The caller must pin the queue and the list both and must hold the list + * locked against racing watch additions/removals. + */ +int add_watch_to_object(struct watch *watch, struct watch_list *wlist) +{ + struct watch_queue *wqueue = rcu_access_pointer(watch->queue); + struct watch *w; + + hlist_for_each_entry(w, &wlist->watchers, list_node) { + struct watch_queue *wq = rcu_access_pointer(w->queue); + if (wqueue == wq && watch->id == w->id) + return -EBUSY; + } + + watch->cred = get_current_cred(); + rcu_assign_pointer(watch->watch_list, wlist); + + spin_lock_bh(&wqueue->lock); + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + spin_unlock_bh(&wqueue->lock); + + hlist_add_head(&watch->list_node, &wlist->watchers); + return 0; +} +EXPORT_SYMBOL(add_watch_to_object); + +/** + * remove_watch_from_object - Remove a watch or all watches from an object. + * @wlist: The watch list to remove from + * @wq: The watch queue of interest (ignored if @all is true) + * @id: The ID of the watch to remove (ignored if @all is true) + * @all: True to remove all objects + * + * Remove a specific watch or all watches from an object. A notification is + * sent to the watcher to tell them that this happened. + */ +int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, + u64 id, bool all) +{ + struct watch_notification_removal n; + struct watch_queue *wqueue; + struct watch *watch; + int ret = -EBADSLT; + + rcu_read_lock(); + +again: + spin_lock(&wlist->lock); + hlist_for_each_entry(watch, &wlist->watchers, list_node) { + if (all || + (watch->id == id && rcu_access_pointer(watch->queue) == wq)) + goto found; + } + spin_unlock(&wlist->lock); + goto out; + +found: + ret = 0; + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + spin_unlock(&wlist->lock); + + /* We now own the reference on watch that used to belong to wlist. */ + + n.watch.type = WATCH_TYPE_META; + n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; + n.watch.info = watch->info_id | watch_sizeof(n.watch); + n.id = id; + if (id != 0) + n.watch.info = watch->info_id | watch_sizeof(n); + + wqueue = rcu_dereference(watch->queue); + + /* We don't need the watch list lock for the next bit as RCU is + * protecting *wqueue from deallocation. + */ + if (wqueue) { + post_one_notification(wqueue, &n.watch); + + spin_lock_bh(&wqueue->lock); + + if (!hlist_unhashed(&watch->queue_node)) { + hlist_del_init_rcu(&watch->queue_node); + put_watch(watch); + } + + spin_unlock_bh(&wqueue->lock); + } + + if (wlist->release_watch) { + void (*release_watch)(struct watch *); + + release_watch = wlist->release_watch; + rcu_read_unlock(); + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + + if (all && !hlist_empty(&wlist->watchers)) + goto again; +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(remove_watch_from_object); + +/* + * Remove all the watches that are contributory to a queue. This has the + * potential to race with removal of the watches by the destruction of the + * objects being watched or with the distribution of notifications. + */ +void watch_queue_clear(struct watch_queue *wqueue) +{ + struct watch_list *wlist; + struct watch *watch; + bool release; + + rcu_read_lock(); + spin_lock_bh(&wqueue->lock); + + /* Prevent new additions and prevent notifications from happening */ + wqueue->defunct = true; + + while (!hlist_empty(&wqueue->watches)) { + watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); + hlist_del_init_rcu(&watch->queue_node); + /* We now own a ref on the watch. */ + spin_unlock_bh(&wqueue->lock); + + /* We can't do the next bit under the queue lock as we need to + * get the list lock - which would cause a deadlock if someone + * was removing from the opposite direction at the same time or + * posting a notification. + */ + wlist = rcu_dereference(watch->watch_list); + if (wlist) { + void (*release_watch)(struct watch *); + + spin_lock(&wlist->lock); + + release = !hlist_unhashed(&watch->list_node); + if (release) { + hlist_del_init_rcu(&watch->list_node); + rcu_assign_pointer(watch->watch_list, NULL); + + /* We now own a second ref on the watch. */ + } + + release_watch = wlist->release_watch; + spin_unlock(&wlist->lock); + + if (release) { + if (release_watch) { + rcu_read_unlock(); + /* This might need to call dput(), so + * we have to drop all the locks. + */ + (*release_watch)(watch); + rcu_read_lock(); + } + put_watch(watch); + } + } + + put_watch(watch); + spin_lock_bh(&wqueue->lock); + } + + spin_unlock_bh(&wqueue->lock); + rcu_read_unlock(); +} + +/** + * get_watch_queue - Get a watch queue from its file descriptor. + * @fd: The fd to query. + */ +struct watch_queue *get_watch_queue(int fd) +{ + struct pipe_inode_info *pipe; + struct watch_queue *wqueue = ERR_PTR(-EINVAL); + struct fd f; + + f = fdget(fd); + if (f.file) { + pipe = get_pipe_info(f.file, false); + if (pipe && pipe->watch_queue) { + wqueue = pipe->watch_queue; + kref_get(&wqueue->usage); + } + fdput(f); + } + + return wqueue; +} +EXPORT_SYMBOL(get_watch_queue); + +/* + * Initialise a watch queue + */ +int watch_queue_init(struct pipe_inode_info *pipe) +{ + struct watch_queue *wqueue; + + wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); + if (!wqueue) + return -ENOMEM; + + wqueue->pipe = pipe; + kref_init(&wqueue->usage); + spin_lock_init(&wqueue->lock); + INIT_HLIST_HEAD(&wqueue->watches); + + pipe->watch_queue = wqueue; + return 0; +} -- cgit v1.2.3 From f7e47677e39a03057dcced2016c92a9c868693ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jan 2020 17:07:11 +0000 Subject: watch_queue: Add a key/keyring notification facility Add a key/keyring change notification facility whereby notifications about changes in key and keyring content and attributes can be received. Firstly, an event queue needs to be created: pipe2(fds, O_NOTIFICATION_PIPE); ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256); then a notification can be set up to report notifications via that queue: struct watch_notification_filter filter = { .nr_filters = 1, .filters = { [0] = { .type = WATCH_TYPE_KEY_NOTIFY, .subtype_filter[0] = UINT_MAX, }, }, }; ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter); keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fds[1], 0x01); After that, records will be placed into the queue when events occur in which keys are changed in some way. Records are of the following format: struct key_notification { struct watch_notification watch; __u32 key_id; __u32 aux; } *n; Where: n->watch.type will be WATCH_TYPE_KEY_NOTIFY. n->watch.subtype will indicate the type of event, such as NOTIFY_KEY_REVOKED. n->watch.info & WATCH_INFO_LENGTH will indicate the length of the record. n->watch.info & WATCH_INFO_ID will be the second argument to keyctl_watch_key(), shifted. n->key will be the ID of the affected key. n->aux will hold subtype-dependent information, such as the key being linked into the keyring specified by n->key in the case of NOTIFY_KEY_LINKED. Note that it is permissible for event records to be of variable length - or, at least, the length may be dependent on the subtype. Note also that the queue can be shared between multiple notifications of various types. Signed-off-by: David Howells Reviewed-by: James Morris --- Documentation/security/keys/core.rst | 57 +++++++++++++++++++++ include/linux/key.h | 3 ++ include/uapi/linux/keyctl.h | 2 + include/uapi/linux/watch_queue.h | 28 +++++++++- security/keys/Kconfig | 9 ++++ security/keys/compat.c | 3 ++ security/keys/gc.c | 5 ++ security/keys/internal.h | 30 ++++++++++- security/keys/key.c | 38 +++++++++----- security/keys/keyctl.c | 99 ++++++++++++++++++++++++++++++++++-- security/keys/keyring.c | 20 +++++--- security/keys/request_key.c | 4 +- 12 files changed, 270 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index d9b0b859018b..6cff2b5f88ed 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -1026,6 +1026,63 @@ The keyctl syscall functions are: written into the output buffer. Verification returns 0 on success. + * Watch a key or keyring for changes:: + + long keyctl(KEYCTL_WATCH_KEY, key_serial_t key, int queue_fd, + const struct watch_notification_filter *filter); + + This will set or remove a watch for changes on the specified key or + keyring. + + "key" is the ID of the key to be watched. + + "queue_fd" is a file descriptor referring to an open "/dev/watch_queue" + which manages the buffer into which notifications will be delivered. + + "filter" is either NULL to remove a watch or a filter specification to + indicate what events are required from the key. + + See Documentation/watch_queue.rst for more information. + + Note that only one watch may be emplaced for any particular { key, + queue_fd } combination. + + Notification records look like:: + + struct key_notification { + struct watch_notification watch; + __u32 key_id; + __u32 aux; + }; + + In this, watch::type will be "WATCH_TYPE_KEY_NOTIFY" and subtype will be + one of:: + + NOTIFY_KEY_INSTANTIATED + NOTIFY_KEY_UPDATED + NOTIFY_KEY_LINKED + NOTIFY_KEY_UNLINKED + NOTIFY_KEY_CLEARED + NOTIFY_KEY_REVOKED + NOTIFY_KEY_INVALIDATED + NOTIFY_KEY_SETATTR + + Where these indicate a key being instantiated/rejected, updated, a link + being made in a keyring, a link being removed from a keyring, a keyring + being cleared, a key being revoked, a key being invalidated or a key + having one of its attributes changed (user, group, perm, timeout, + restriction). + + If a watched key is deleted, a basic watch_notification will be issued + with "type" set to WATCH_TYPE_META and "subtype" set to + watch_meta_removal_notification. The watchpoint ID will be set in the + "info" field. + + This needs to be configured by enabling: + + "Provide key/keyring change notifications" (KEY_NOTIFICATIONS) + + Kernel Services =============== diff --git a/include/linux/key.h b/include/linux/key.h index 6cf8e71cf8b7..b99b40db08fc 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -176,6 +176,9 @@ struct key { struct list_head graveyard_link; struct rb_node serial_node; }; +#ifdef CONFIG_KEY_NOTIFICATIONS + struct watch_list *watchers; /* Entities watching this key for changes */ +#endif struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index ed3d5893830d..4c8884eea808 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -69,6 +69,7 @@ #define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ #define KEYCTL_MOVE 30 /* Move keys between keyrings */ #define KEYCTL_CAPABILITIES 31 /* Find capabilities of keyrings subsystem */ +#define KEYCTL_WATCH_KEY 32 /* Watch a key or ring of keys for changes */ /* keyctl structures */ struct keyctl_dh_params { @@ -130,5 +131,6 @@ struct keyctl_pkey_params { #define KEYCTL_CAPS0_MOVE 0x80 /* KEYCTL_MOVE supported */ #define KEYCTL_CAPS1_NS_KEYRING_NAME 0x01 /* Keyring names are per-user_namespace */ #define KEYCTL_CAPS1_NS_KEY_TAG 0x02 /* Key indexing can include a namespace tag */ +#define KEYCTL_CAPS1_NOTIFICATIONS 0x04 /* Keys generate watchable notifications */ #endif /* _LINUX_KEYCTL_H */ diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h index 3a5790f1f05d..c3d8320b5d3a 100644 --- a/include/uapi/linux/watch_queue.h +++ b/include/uapi/linux/watch_queue.h @@ -13,7 +13,8 @@ enum watch_notification_type { WATCH_TYPE_META = 0, /* Special record */ - WATCH_TYPE__NR = 1 + WATCH_TYPE_KEY_NOTIFY = 1, /* Key change event notification */ + WATCH_TYPE__NR = 2 }; enum watch_meta_notification_subtype { @@ -75,4 +76,29 @@ struct watch_notification_removal { __u64 id; /* Type-dependent identifier */ }; +/* + * Type of key/keyring change notification. + */ +enum key_notification_subtype { + NOTIFY_KEY_INSTANTIATED = 0, /* Key was instantiated (aux is error code) */ + NOTIFY_KEY_UPDATED = 1, /* Key was updated */ + NOTIFY_KEY_LINKED = 2, /* Key (aux) was added to watched keyring */ + NOTIFY_KEY_UNLINKED = 3, /* Key (aux) was removed from watched keyring */ + NOTIFY_KEY_CLEARED = 4, /* Keyring was cleared */ + NOTIFY_KEY_REVOKED = 5, /* Key was revoked */ + NOTIFY_KEY_INVALIDATED = 6, /* Key was invalidated */ + NOTIFY_KEY_SETATTR = 7, /* Key's attributes got changed */ +}; + +/* + * Key/keyring notification record. + * - watch.type = WATCH_TYPE_KEY_NOTIFY + * - watch.subtype = enum key_notification_type + */ +struct key_notification { + struct watch_notification watch; + __u32 key_id; /* The key/keyring affected */ + __u32 aux; /* Per-type auxiliary data */ +}; + #endif /* _UAPI_LINUX_WATCH_QUEUE_H */ diff --git a/security/keys/Kconfig b/security/keys/Kconfig index 47c041563d41..d4dc5ea208af 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -116,3 +116,12 @@ config KEY_DH_OPERATIONS in the kernel. If you are unsure as to whether this is required, answer N. + +config KEY_NOTIFICATIONS + bool "Provide key/keyring change notifications" + depends on KEYS && WATCH_QUEUE + help + This option provides support for getting change notifications on keys + and keyrings on which the caller has View permission. This makes use + of the /dev/watch_queue misc device to handle the notification + buffer and provides KEYCTL_WATCH_KEY to enable/disable watches. diff --git a/security/keys/compat.c b/security/keys/compat.c index b975f8f11124..6ee9d8f6a4a5 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -156,6 +156,9 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, case KEYCTL_CAPABILITIES: return keyctl_capabilities(compat_ptr(arg2), arg3); + case KEYCTL_WATCH_KEY: + return keyctl_watch_key(arg2, arg3, arg4); + default: return -EOPNOTSUPP; } diff --git a/security/keys/gc.c b/security/keys/gc.c index 671dd730ecfc..3c90807476eb 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -131,6 +131,11 @@ static noinline void key_gc_unused_keys(struct list_head *keys) kdebug("- %u", key->serial); key_check(key); +#ifdef CONFIG_KEY_NOTIFICATIONS + remove_watch_list(key->watchers, key->serial); + key->watchers = NULL; +#endif + /* Throw away the key data if the key is instantiated */ if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); diff --git a/security/keys/internal.h b/security/keys/internal.h index 6d0ca48ae9a5..28e17f4f3328 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -99,7 +100,8 @@ extern int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit); extern int __key_link_check_live_key(struct key *keyring, struct key *key); -extern void __key_link(struct key *key, struct assoc_array_edit **_edit); +extern void __key_link(struct key *keyring, struct key *key, + struct assoc_array_edit **_edit); extern void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit); @@ -183,6 +185,23 @@ extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, key_perm_t perm); +static inline void notify_key(struct key *key, + enum key_notification_subtype subtype, u32 aux) +{ +#ifdef CONFIG_KEY_NOTIFICATIONS + struct key_notification n = { + .watch.type = WATCH_TYPE_KEY_NOTIFY, + .watch.subtype = subtype, + .watch.info = watch_sizeof(n), + .key_id = key_serial(key), + .aux = aux, + }; + + post_watch_notification(key->watchers, &n.watch, current_cred(), + n.key_id); +#endif +} + /* * Check to see whether permission is granted to use a key in the desired way. */ @@ -333,6 +352,15 @@ static inline long keyctl_pkey_e_d_s(int op, extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); +#ifdef CONFIG_KEY_NOTIFICATIONS +extern long keyctl_watch_key(key_serial_t, int, int); +#else +static inline long keyctl_watch_key(key_serial_t key_id, int watch_fd, int watch_id) +{ + return -EOPNOTSUPP; +} +#endif + /* * Debugging key validation */ diff --git a/security/keys/key.c b/security/keys/key.c index e959b3c96b48..e282c6179b21 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -444,6 +444,7 @@ static int __key_instantiate_and_link(struct key *key, /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_INSTANTIATED, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; @@ -453,7 +454,7 @@ static int __key_instantiate_and_link(struct key *key, if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) set_bit(KEY_FLAG_KEEP, &key->flags); - __key_link(key, _edit); + __key_link(keyring, key, _edit); } /* disable the authorisation key */ @@ -601,6 +602,7 @@ int key_reject_and_link(struct key *key, /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, -error); + notify_key(key, NOTIFY_KEY_INSTANTIATED, -error); key->expiry = ktime_get_real_seconds() + timeout; key_schedule_gc(key->expiry + key_gc_delay); @@ -611,7 +613,7 @@ int key_reject_and_link(struct key *key, /* and link it into the destination keyring */ if (keyring && link_ret == 0) - __key_link(key, &edit); + __key_link(keyring, key, &edit); /* disable the authorisation key */ if (authkey) @@ -764,9 +766,11 @@ static inline key_ref_t __key_update(key_ref_t key_ref, down_write(&key->sem); ret = key->type->update(key, prep); - if (ret == 0) + if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_UPDATED, 0); + } up_write(&key->sem); @@ -1023,9 +1027,11 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) down_write(&key->sem); ret = key->type->update(key, &prep); - if (ret == 0) + if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); + notify_key(key, NOTIFY_KEY_UPDATED, 0); + } up_write(&key->sem); @@ -1057,15 +1063,17 @@ void key_revoke(struct key *key) * instantiated */ down_write_nested(&key->sem, 1); - if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags) && - key->type->revoke) - key->type->revoke(key); - - /* set the death time to no more than the expiry time */ - time = ktime_get_real_seconds(); - if (key->revoked_at == 0 || key->revoked_at > time) { - key->revoked_at = time; - key_schedule_gc(key->revoked_at + key_gc_delay); + if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags)) { + notify_key(key, NOTIFY_KEY_REVOKED, 0); + if (key->type->revoke) + key->type->revoke(key); + + /* set the death time to no more than the expiry time */ + time = ktime_get_real_seconds(); + if (key->revoked_at == 0 || key->revoked_at > time) { + key->revoked_at = time; + key_schedule_gc(key->revoked_at + key_gc_delay); + } } up_write(&key->sem); @@ -1087,8 +1095,10 @@ void key_invalidate(struct key *key) if (!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { down_write_nested(&key->sem, 1); - if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) + if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) { + notify_key(key, NOTIFY_KEY_INVALIDATED, 0); key_schedule_gc_links(); + } up_write(&key->sem); } } diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 5e01192e222a..7d8de1c9a478 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -37,7 +37,9 @@ static const unsigned char keyrings_capabilities[2] = { KEYCTL_CAPS0_MOVE ), [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME | - KEYCTL_CAPS1_NS_KEY_TAG), + KEYCTL_CAPS1_NS_KEY_TAG | + (IS_ENABLED(CONFIG_KEY_NOTIFICATIONS) ? KEYCTL_CAPS1_NOTIFICATIONS : 0) + ), }; static int key_get_type_from_user(char *type, @@ -1039,6 +1041,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) if (group != (gid_t) -1) key->gid = gid; + notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; error_put: @@ -1089,6 +1092,7 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) /* if we're not the sysadmin, we can only change a key that we own */ if (capable(CAP_SYS_ADMIN) || uid_eq(key->uid, current_fsuid())) { key->perm = perm; + notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; } @@ -1480,10 +1484,12 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) okay: key = key_ref_to_ptr(key_ref); ret = 0; - if (test_bit(KEY_FLAG_KEEP, &key->flags)) + if (test_bit(KEY_FLAG_KEEP, &key->flags)) { ret = -EPERM; - else + } else { key_set_timeout(key, timeout); + notify_key(key, NOTIFY_KEY_SETATTR, 0); + } key_put(key); error: @@ -1757,6 +1763,90 @@ error: return ret; } +#ifdef CONFIG_KEY_NOTIFICATIONS +/* + * Watch for changes to a key. + * + * The caller must have View permission to watch a key or keyring. + */ +long keyctl_watch_key(key_serial_t id, int watch_queue_fd, int watch_id) +{ + struct watch_queue *wqueue; + struct watch_list *wlist = NULL; + struct watch *watch = NULL; + struct key *key; + key_ref_t key_ref; + long ret; + + if (watch_id < -1 || watch_id > 0xff) + return -EINVAL; + + key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_VIEW); + if (IS_ERR(key_ref)) + return PTR_ERR(key_ref); + key = key_ref_to_ptr(key_ref); + + wqueue = get_watch_queue(watch_queue_fd); + if (IS_ERR(wqueue)) { + ret = PTR_ERR(wqueue); + goto err_key; + } + + if (watch_id >= 0) { + ret = -ENOMEM; + if (!key->watchers) { + wlist = kzalloc(sizeof(*wlist), GFP_KERNEL); + if (!wlist) + goto err_wqueue; + init_watch_list(wlist, NULL); + } + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (!watch) + goto err_wlist; + + init_watch(watch, wqueue); + watch->id = key->serial; + watch->info_id = (u32)watch_id << WATCH_INFO_ID__SHIFT; + + ret = security_watch_key(key); + if (ret < 0) + goto err_watch; + + down_write(&key->sem); + if (!key->watchers) { + key->watchers = wlist; + wlist = NULL; + } + + ret = add_watch_to_object(watch, key->watchers); + up_write(&key->sem); + + if (ret == 0) + watch = NULL; + } else { + ret = -EBADSLT; + if (key->watchers) { + down_write(&key->sem); + ret = remove_watch_from_object(key->watchers, + wqueue, key_serial(key), + false); + up_write(&key->sem); + } + } + +err_watch: + kfree(watch); +err_wlist: + kfree(wlist); +err_wqueue: + put_watch_queue(wqueue); +err_key: + key_put(key); + return ret; +} +#endif /* CONFIG_KEY_NOTIFICATIONS */ + /* * Get keyrings subsystem capabilities. */ @@ -1926,6 +2016,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, case KEYCTL_CAPABILITIES: return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3); + case KEYCTL_WATCH_KEY: + return keyctl_watch_key((key_serial_t)arg2, (int)arg3, (int)arg4); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 5ca620d31cd3..14abfe765b7e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1056,12 +1056,14 @@ int keyring_restrict(key_ref_t keyring_ref, const char *type, down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); - if (keyring->restrict_link) + if (keyring->restrict_link) { ret = -EEXIST; - else if (keyring_detect_restriction_cycle(keyring, restrict_link)) + } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; - else + } else { keyring->restrict_link = restrict_link; + notify_key(keyring, NOTIFY_KEY_SETATTR, 0); + } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); @@ -1362,12 +1364,14 @@ int __key_link_check_live_key(struct key *keyring, struct key *key) * holds at most one link to any given key of a particular type+description * combination. */ -void __key_link(struct key *key, struct assoc_array_edit **_edit) +void __key_link(struct key *keyring, struct key *key, + struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; + notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* @@ -1451,7 +1455,7 @@ int key_link(struct key *keyring, struct key *key) if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) - __key_link(key, &edit); + __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); @@ -1483,7 +1487,7 @@ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); - + edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) @@ -1503,6 +1507,7 @@ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); + notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } @@ -1621,7 +1626,7 @@ int key_move(struct key *key, goto error; __key_unlink(from_keyring, key, &from_edit); - __key_link(key, &to_edit); + __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); @@ -1655,6 +1660,7 @@ int keyring_clear(struct key *keyring) } else { if (edit) assoc_array_apply_edit(edit); + notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 957b9e3e1492..e1b9f1a80676 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -418,7 +418,7 @@ static int construct_alloc_key(struct keyring_search_context *ctx, goto key_already_present; if (dest_keyring) - __key_link(key, &edit); + __key_link(dest_keyring, key, &edit); mutex_unlock(&key_construction_mutex); if (dest_keyring) @@ -437,7 +437,7 @@ key_already_present: if (dest_keyring) { ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) - __key_link(key, &edit); + __key_link(dest_keyring, key, &edit); __key_link_end(dest_keyring, &ctx->index_key, edit); if (ret < 0) goto link_check_failed; -- cgit v1.2.3 From e3b1078bedd323df343894a27eb3b3c34944dfd1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 May 2020 13:41:41 -0700 Subject: fscrypt: add support for IV_INO_LBLK_32 policies The eMMC inline crypto standard will only specify 32 DUN bits (a.k.a. IV bits), unlike UFS's 64. IV_INO_LBLK_64 is therefore not applicable, but an encryption format which uses one key per policy and permits the moving of encrypted file contents (as f2fs's garbage collector requires) is still desirable. To support such hardware, add a new encryption format IV_INO_LBLK_32 that makes the best use of the 32 bits: the IV is set to 'SipHash-2-4(inode_number) + file_logical_block_number mod 2^32', where the SipHash key is derived from the fscrypt master key. We hash only the inode number and not also the block number, because we need to maintain contiguity of DUNs to merge bios. Unlike with IV_INO_LBLK_64, with this format IV reuse is possible; this is unavoidable given the size of the DUN. This means this format should only be used where the requirements of the first paragraph apply. However, the hash spreads out the IVs in the whole usable range, and the use of a keyed hash makes it difficult for an attacker to determine which files use which IVs. Besides the above differences, this flag works like IV_INO_LBLK_64 in that on ext4 it is only allowed if the stable_inodes feature has been enabled to prevent inode numbers and the filesystem UUID from changing. Link: https://lore.kernel.org/r/20200515204141.251098-1-ebiggers@kernel.org Reviewed-by: Theodore Ts'o Reviewed-by: Paul Crowley Signed-off-by: Eric Biggers --- Documentation/filesystems/fscrypt.rst | 33 ++++++++++++-- fs/crypto/crypto.c | 6 ++- fs/crypto/fscrypt_private.h | 20 ++++++--- fs/crypto/keyring.c | 5 ++- fs/crypto/keysetup.c | 85 +++++++++++++++++++++++++++-------- fs/crypto/policy.c | 51 +++++++++++++++------ include/uapi/linux/fscrypt.h | 3 +- 7 files changed, 157 insertions(+), 46 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index aa072112cfff..f517af8ec11c 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -292,8 +292,22 @@ files' data differently, inode numbers are included in the IVs. Consequently, shrinking the filesystem may not be allowed. This format is optimized for use with inline encryption hardware -compliant with the UFS or eMMC standards, which support only 64 IV -bits per I/O request and may have only a small number of keyslots. +compliant with the UFS standard, which supports only 64 IV bits per +I/O request and may have only a small number of keyslots. + +IV_INO_LBLK_32 policies +----------------------- + +IV_INO_LBLK_32 policies work like IV_INO_LBLK_64, except that for +IV_INO_LBLK_32, the inode number is hashed with SipHash-2-4 (where the +SipHash key is derived from the master key) and added to the file +logical block number mod 2^32 to produce a 32-bit IV. + +This format is optimized for use with inline encryption hardware +compliant with the eMMC v5.2 standard, which supports only 32 IV bits +per I/O request and may have only a small number of keyslots. This +format results in some level of IV reuse, so it should only be used +when necessary due to hardware limitations. Key identifiers --------------- @@ -369,6 +383,10 @@ a little endian number, except that: to 32 bits and is placed in bits 0-31 of the IV. The inode number (which is also limited to 32 bits) is placed in bits 32-63. +- With `IV_INO_LBLK_32 policies`_, the logical block number is limited + to 32 bits and is placed in bits 0-31 of the IV. The inode number + is then hashed and added mod 2^32. + Note that because file logical block numbers are included in the IVs, filesystems must enforce that blocks are never shifted around within encrypted files, e.g. via "collapse range" or "insert range". @@ -465,8 +483,15 @@ This structure must be initialized as follows: (0x3). - FSCRYPT_POLICY_FLAG_DIRECT_KEY: See `DIRECT_KEY policies`_. - FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64: See `IV_INO_LBLK_64 - policies`_. This is mutually exclusive with DIRECT_KEY and is not - supported on v1 policies. + policies`_. + - FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32: See `IV_INO_LBLK_32 + policies`_. + + v1 encryption policies only support the PAD_* and DIRECT_KEY flags. + The other flags are only supported by v2 encryption policies. + + The DIRECT_KEY, IV_INO_LBLK_64, and IV_INO_LBLK_32 flags are + mutually exclusive. - For v2 encryption policies, ``__reserved`` must be zeroed. diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 40c2821a341e..ed015cb66c7c 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -77,8 +77,12 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, memset(iv, 0, ci->ci_mode->ivsize); if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { - WARN_ON_ONCE((u32)lblk_num != lblk_num); + WARN_ON_ONCE(lblk_num > U32_MAX); + WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX); lblk_num |= (u64)ci->ci_inode->i_ino << 32; + } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + WARN_ON_ONCE(lblk_num > U32_MAX); + lblk_num = (u32)(ci->ci_hashed_ino + lblk_num); } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 855ea935f5a6..eb7fcd2b7fb8 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -222,6 +222,9 @@ struct fscrypt_info { /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + + /* Hashed inode number. Only set for IV_INO_LBLK_32 */ + u32 ci_hashed_ino; }; typedef enum { @@ -290,6 +293,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_DIRECT_KEY 3 #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 #define HKDF_CONTEXT_DIRHASH_KEY 5 +#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 +#define HKDF_CONTEXT_INODE_HASH_KEY 7 int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, @@ -386,14 +391,17 @@ struct fscrypt_master_key { struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; - /* Crypto API transforms for DIRECT_KEY policies, allocated on-demand */ - struct crypto_skcipher *mk_direct_tfms[__FSCRYPT_MODE_MAX + 1]; - /* - * Crypto API transforms for filesystem-layer implementation of - * IV_INO_LBLK_64 policies, allocated on-demand. + * Per-mode encryption keys for the various types of encryption policies + * that use them. Allocated and derived on-demand. */ - struct crypto_skcipher *mk_iv_ino_lblk_64_tfms[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1]; + + /* Hash key for inode numbers. Initialized only when needed. */ + siphash_key_t mk_ino_hash_key; + bool mk_ino_hash_key_initialized; } __randomize_layout; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index c983ddfde8ad..e24eb48bfbe1 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -45,8 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk) wipe_master_key_secret(&mk->mk_secret); for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) { - crypto_free_skcipher(mk->mk_direct_tfms[i]); - crypto_free_skcipher(mk->mk_iv_ino_lblk_64_tfms[i]); + crypto_free_skcipher(mk->mk_direct_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]); } key_put(mk->mk_users); diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 675479f8e6f3..1129adfa097d 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -46,6 +46,8 @@ struct fscrypt_mode fscrypt_modes[] = { }, }; +static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); + static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) @@ -130,7 +132,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; - struct crypto_skcipher *tfm, *prev_tfm; + struct crypto_skcipher *tfm; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; @@ -139,10 +141,17 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX)) return -EINVAL; - /* pairs with cmpxchg() below */ + /* pairs with smp_store_release() below */ tfm = READ_ONCE(tfms[mode_num]); - if (likely(tfm != NULL)) - goto done; + if (likely(tfm != NULL)) { + ci->ci_ctfm = tfm; + return 0; + } + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (tfms[mode_num]) + goto done_unlock; BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); @@ -157,21 +166,21 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) - return err; + goto out_unlock; tfm = fscrypt_allocate_skcipher(mode, mode_key, inode); memzero_explicit(mode_key, mode->keysize); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - /* pairs with READ_ONCE() above */ - prev_tfm = cmpxchg(&tfms[mode_num], NULL, tfm); - if (prev_tfm != NULL) { - crypto_free_skcipher(tfm); - tfm = prev_tfm; + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + goto out_unlock; } -done: + /* pairs with READ_ONCE() above */ + smp_store_release(&tfms[mode_num], tfm); +done_unlock: ci->ci_ctfm = tfm; - return 0; + err = 0; +out_unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + return err; } int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, @@ -189,6 +198,43 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } +static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + int err; + + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, + HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); + if (err) + return err; + + /* pairs with smp_store_release() below */ + if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (mk->mk_ino_hash_key_initialized) + goto unlock; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, + (u8 *)&mk->mk_ino_hash_key, + sizeof(mk->mk_ino_hash_key)); + if (err) + goto unlock; + /* pairs with smp_load_acquire() above */ + smp_store_release(&mk->mk_ino_hash_key_initialized, true); +unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + if (err) + return err; + } + + ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, + &mk->mk_ino_hash_key); + return 0; +} + static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { @@ -203,7 +249,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { @@ -211,11 +257,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline - * encryption hardware compliant with the UFS or eMMC standards. + * encryption hardware compliant with the UFS standard. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); + } else if (ci->ci_policy.v2.flags & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index a15aec8e068c..d23ff162c78b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -69,18 +69,14 @@ static bool supported_direct_key_modes(const struct inode *inode, return true; } -static bool supported_iv_ino_lblk_64_policy( - const struct fscrypt_policy_v2 *policy, - const struct inode *inode) +static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, + const struct inode *inode, + const char *type, + int max_ino_bits, int max_lblk_bits) { struct super_block *sb = inode->i_sb; int ino_bits = 64, lblk_bits = 64; - if (policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { - fscrypt_warn(inode, - "The DIRECT_KEY and IV_INO_LBLK_64 flags are mutually exclusive"); - return false; - } /* * It's unsafe to include inode numbers in the IVs if the filesystem can * potentially renumber inodes, e.g. via filesystem shrinking. @@ -88,16 +84,22 @@ static bool supported_iv_ino_lblk_64_policy( if (!sb->s_cop->has_stable_inodes || !sb->s_cop->has_stable_inodes(sb)) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't have stable inode numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers", + type, sb->s_id); return false; } if (sb->s_cop->get_ino_and_lblk_bits) sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - if (ino_bits > 32 || lblk_bits > 32) { + if (ino_bits > max_ino_bits) { + fscrypt_warn(inode, + "Can't use %s policy on filesystem '%s' because its inode numbers are too long", + type, sb->s_id); + return false; + } + if (lblk_bits > max_lblk_bits) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't use 32-bit inode and block numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because its block numbers are too long", + type, sb->s_id); return false; } return true; @@ -140,6 +142,8 @@ static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { + int count = 0; + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, @@ -155,13 +159,29 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32); + if (count > 1) { + fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)", + policy->flags); + return false; + } + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_64_policy(policy, inode)) + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", + 32, 32)) + return false; + + if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && + /* This uses hashed inode numbers, so ino_bits doesn't matter. */ + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", + INT_MAX, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -366,6 +386,9 @@ static int set_encryption_policy(struct inode *inode, policy->v2.master_key_identifier); if (err) return err; + if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) + pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n", + current->comm, current->pid); break; default: WARN_ON(1); diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index a10e3cdc2839..7875709ccfeb 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -19,7 +19,8 @@ #define FSCRYPT_POLICY_FLAGS_PAD_MASK 0x03 #define FSCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 #define FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 0x08 -#define FSCRYPT_POLICY_FLAGS_VALID 0x0F +#define FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32 0x10 +#define FSCRYPT_POLICY_FLAGS_VALID 0x1F /* Encryption algorithms */ #define FSCRYPT_MODE_AES_256_XTS 1 -- cgit v1.2.3 From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer, sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. The main advantage of this approach is that it operates as close as possible to the socket, and therefore allows to avoid packet-based NAT given in connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows to expose NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications does not bother about getpeername(), but in a few occasions we've seen breakage when validating the peer's address since it returns unexpectedly the backend tuple instead of the service one. Therefore, this trivial patch allows to customise and adds a getpeername() as well as getsockname() BPF cgroup hook for both IPv4 and IPv6 in order to address this situation. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before; curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] After; with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed peer to the context similar as in inet{,6}_getname() fashion, but API-wise this is suboptimal as it always enforces programs having to test for ctx->peer which can easily be missed, hence BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks. [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 6 +++++- net/core/filter.c | 4 ++++ net/ipv4/af_inet.c | 8 ++++++-- net/ipv6/af_inet6.c | 9 ++++++--- tools/include/uapi/linux/bpf.h | 4 ++++ 8 files changed, 42 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c7d67d65d8c..2ed8351f47a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: diff --git a/net/core/filter.c b/net/core/filter.c index 822d662f97ef..bd2853d23b50 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fcf0d12a407a..8f5c8c9409d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -755,12 +755,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -781,6 +780,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 771a462a8322..3b6fcc0c321a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -504,9 +504,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -531,9 +530,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 146c742f1d49..1cddc398404a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From faf1d25440d6ad06d509dada4b6fe62fea844370 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 13 May 2020 15:38:44 +0200 Subject: loop: Clean up LOOP_SET_STATUS lo_flags handling LOOP_SET_STATUS(64) will actually allow some lo_flags to be modified; in particular, LO_FLAGS_AUTOCLEAR can be set and cleared, whereas LO_FLAGS_PARTSCAN can be set to request a partition scan. Make this explicit by updating the UAPI to include the flags that can be set/cleared using this ioctl. The implementation can then blindly take over the passed in flags, and use the previous flags for those flags that can't be set / cleared using LOOP_SET_STATUS. Signed-off-by: Martijn Coenen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 19 +++++++++++++------ include/uapi/linux/loop.h | 10 ++++++++-- 2 files changed, 21 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 31f10da4945e..13518ba191f5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1036,9 +1036,7 @@ loop_set_status_from_info(struct loop_device *lo, lo->transfer = xfer->transfer; lo->ioctl = xfer->ioctl; - if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) != - (info->lo_flags & LO_FLAGS_AUTOCLEAR)) - lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; + lo->lo_flags = info->lo_flags; lo->lo_encrypt_key_size = info->lo_encrypt_key_size; lo->lo_init[0] = info->lo_init[0]; @@ -1323,6 +1321,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) int err; struct block_device *bdev; kuid_t uid = current_uid(); + int prev_lo_flags; bool partscan = false; bool size_changed = false; @@ -1359,10 +1358,19 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) goto out_unfreeze; } + prev_lo_flags = lo->lo_flags; + err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; + /* Mask out flags that can't be set using LOOP_SET_STATUS. */ + lo->lo_flags &= ~LOOP_SET_STATUS_SETTABLE_FLAGS; + /* For those flags, use the previous values instead */ + lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS; + /* For flags that can't be cleared, use previous values too */ + lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + if (size_changed) { loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, lo->lo_backing_file); @@ -1377,9 +1385,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); - if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && - !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_flags |= LO_FLAGS_PARTSCAN; + if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && + !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; bdev = lo->lo_device; partscan = true; diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index 080a8df134ef..6b32fee80ce0 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -25,6 +25,12 @@ enum { LO_FLAGS_DIRECT_IO = 16, }; +/* LO_FLAGS that can be set using LOOP_SET_STATUS(64) */ +#define LOOP_SET_STATUS_SETTABLE_FLAGS (LO_FLAGS_AUTOCLEAR | LO_FLAGS_PARTSCAN) + +/* LO_FLAGS that can be cleared using LOOP_SET_STATUS(64) */ +#define LOOP_SET_STATUS_CLEARABLE_FLAGS (LO_FLAGS_AUTOCLEAR) + #include /* for __kernel_old_dev_t */ #include /* for __u64 */ @@ -37,7 +43,7 @@ struct loop_info { int lo_offset; int lo_encrypt_type; int lo_encrypt_key_size; /* ioctl w/o */ - int lo_flags; /* ioctl r/o */ + int lo_flags; char lo_name[LO_NAME_SIZE]; unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ unsigned long lo_init[2]; @@ -53,7 +59,7 @@ struct loop_info64 { __u32 lo_number; /* ioctl r/o */ __u32 lo_encrypt_type; __u32 lo_encrypt_key_size; /* ioctl w/o */ - __u32 lo_flags; /* ioctl r/o */ + __u32 lo_flags; __u8 lo_file_name[LO_NAME_SIZE]; __u8 lo_crypt_name[LO_NAME_SIZE]; __u8 lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ -- cgit v1.2.3 From 3448914e8cc550ba792d4ccc74471d1ca4293aae Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 13 May 2020 15:38:45 +0200 Subject: loop: Add LOOP_CONFIGURE ioctl This allows userspace to completely setup a loop device with a single ioctl, removing the in-between state where the device can be partially configured - eg the loop device has a backing file associated with it, but is reading from the wrong offset. Besides removing the intermediate state, another big benefit of this ioctl is that LOOP_SET_STATUS can be slow; the main reason for this slowness is that LOOP_SET_STATUS(64) calls blk_mq_freeze_queue() to freeze the associated queue; this requires waiting for RCU synchronization, which I've measured can take about 15-20ms on this device on average. In addition to doing what LOOP_SET_STATUS can do, LOOP_CONFIGURE can also be used to: - Set the correct block size immediately by setting loop_config.block_size (avoids LOOP_SET_BLOCK_SIZE) - Explicitly request direct I/O mode by setting LO_FLAGS_DIRECT_IO in loop_config.info.lo_flags (avoids LOOP_SET_DIRECT_IO) - Explicitly request read-only mode by setting LO_FLAGS_READ_ONLY in loop_config.info.lo_flags Here's setting up ~70 regular loop devices with an offset on an x86 Android device, using LOOP_SET_FD and LOOP_SET_STATUS: vsoc_x86:/system/apex # time for i in `seq 30 100`; do losetup -r -o 4096 /dev/block/loop$i com.android.adbd.apex; done 0m03.40s real 0m00.02s user 0m00.03s system Here's configuring ~70 devices in the same way, but using a modified losetup that uses the new LOOP_CONFIGURE ioctl: vsoc_x86:/system/apex # time for i in `seq 30 100`; do losetup -r -o 4096 /dev/block/loop$i com.android.adbd.apex; done 0m01.94s real 0m00.01s user 0m00.01s system Signed-off-by: Martijn Coenen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 104 +++++++++++++++++++++++++++++++++------------- include/uapi/linux/loop.h | 21 ++++++++++ 2 files changed, 97 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 13518ba191f5..a565c5aafa52 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -228,6 +228,19 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) blk_mq_unfreeze_queue(lo->lo_queue); } +/** + * loop_validate_block_size() - validates the passed in block size + * @bsize: size to validate + */ +static int +loop_validate_block_size(unsigned short bsize) +{ + if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + return -EINVAL; + + return 0; +} + /** * loop_set_size() - sets device size and notifies userspace * @lo: struct loop_device to set the size for @@ -1050,23 +1063,24 @@ loop_set_status_from_info(struct loop_device *lo, return 0; } -static int loop_set_fd(struct loop_device *lo, fmode_t mode, - struct block_device *bdev, unsigned int arg) +static int loop_configure(struct loop_device *lo, fmode_t mode, + struct block_device *bdev, + const struct loop_config *config) { struct file *file; struct inode *inode; struct address_space *mapping; struct block_device *claimed_bdev = NULL; - int lo_flags = 0; int error; loff_t size; bool partscan; + unsigned short bsize; /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); error = -EBADF; - file = fget(arg); + file = fget(config->fd); if (!file) goto out; @@ -1075,7 +1089,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bd_start_claiming(bdev, loop_set_fd); + claimed_bdev = bd_start_claiming(bdev, loop_configure); if (IS_ERR(claimed_bdev)) { error = PTR_ERR(claimed_bdev); goto out_putf; @@ -1097,11 +1111,26 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mapping = file->f_mapping; inode = mapping->host; + size = get_loop_size(lo, file); + + if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { + error = -EINVAL; + goto out_unlock; + } + + if (config->block_size) { + error = loop_validate_block_size(config->block_size); + if (error) + goto out_unlock; + } + + error = loop_set_status_from_info(lo, &config->info); + if (error) + goto out_unlock; + if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || !file->f_op->write_iter) - lo_flags |= LO_FLAGS_READ_ONLY; - - size = get_loop_size(lo, file); + lo->lo_flags |= LO_FLAGS_READ_ONLY; error = loop_prepare_queue(lo); if (error) @@ -1109,30 +1138,28 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, error = 0; - set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + set_device_ro(bdev, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - lo->use_dio = false; + lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; - lo->lo_flags = lo_flags; lo->lo_backing_file = file; - lo->transfer = NULL; - lo->ioctl = NULL; - lo->lo_sizelimit = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) + if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); - if (io_is_direct(lo->lo_backing_file) && inode->i_sb->s_bdev) { + if (config->block_size) + bsize = config->block_size; + else if (io_is_direct(lo->lo_backing_file) && inode->i_sb->s_bdev) /* In case of direct I/O, match underlying block size */ - unsigned short bsize = bdev_logical_block_size( - inode->i_sb->s_bdev); + bsize = bdev_logical_block_size(inode->i_sb->s_bdev); + else + bsize = 512; - blk_queue_logical_block_size(lo->lo_queue, bsize); - blk_queue_physical_block_size(lo->lo_queue, bsize); - blk_queue_io_min(lo->lo_queue, bsize); - } + blk_queue_logical_block_size(lo->lo_queue, bsize); + blk_queue_physical_block_size(lo->lo_queue, bsize); + blk_queue_io_min(lo->lo_queue, bsize); loop_update_rotational(lo); loop_update_dio(lo); @@ -1155,14 +1182,14 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (partscan) loop_reread_partitions(lo, bdev); if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, loop_set_fd); + bd_abort_claiming(bdev, claimed_bdev, loop_configure); return 0; out_unlock: mutex_unlock(&loop_ctl_mutex); out_bdev: if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, loop_set_fd); + bd_abort_claiming(bdev, claimed_bdev, loop_configure); out_putf: fput(file); out: @@ -1582,8 +1609,9 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) if (lo->lo_state != Lo_bound) return -ENXIO; - if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg)) - return -EINVAL; + err = loop_validate_block_size(arg); + if (err) + return err; if (lo->lo_queue->limits.logical_block_size == arg) return 0; @@ -1645,8 +1673,27 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, int err; switch (cmd) { - case LOOP_SET_FD: - return loop_set_fd(lo, mode, bdev, arg); + case LOOP_SET_FD: { + /* + * Legacy case - pass in a zeroed out struct loop_config with + * only the file descriptor set , which corresponds with the + * default parameters we'd have used otherwise. + */ + struct loop_config config; + + memset(&config, 0, sizeof(config)); + config.fd = arg; + + return loop_configure(lo, mode, bdev, &config); + } + case LOOP_CONFIGURE: { + struct loop_config config; + + if (copy_from_user(&config, argp, sizeof(config))) + return -EFAULT; + + return loop_configure(lo, mode, bdev, &config); + } case LOOP_CHANGE_FD: return loop_change_fd(lo, bdev, arg); case LOOP_CLR_FD: @@ -1818,6 +1865,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_CLR_FD: case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: + case LOOP_CONFIGURE: arg = (unsigned long) compat_ptr(arg); /* fall through */ case LOOP_SET_FD: diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index 6b32fee80ce0..24a1c45bd1ae 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -31,6 +31,10 @@ enum { /* LO_FLAGS that can be cleared using LOOP_SET_STATUS(64) */ #define LOOP_SET_STATUS_CLEARABLE_FLAGS (LO_FLAGS_AUTOCLEAR) +/* LO_FLAGS that can be set using LOOP_CONFIGURE */ +#define LOOP_CONFIGURE_SETTABLE_FLAGS (LO_FLAGS_READ_ONLY | LO_FLAGS_AUTOCLEAR \ + | LO_FLAGS_PARTSCAN | LO_FLAGS_DIRECT_IO) + #include /* for __kernel_old_dev_t */ #include /* for __u64 */ @@ -66,6 +70,22 @@ struct loop_info64 { __u64 lo_init[2]; }; +/** + * struct loop_config - Complete configuration for a loop device. + * @fd: fd of the file to be used as a backing file for the loop device. + * @block_size: block size to use; ignored if 0. + * @info: struct loop_info64 to configure the loop device with. + * + * This structure is used with the LOOP_CONFIGURE ioctl, and can be used to + * atomically setup and configure all loop device parameters at once. + */ +struct loop_config { + __u32 fd; + __u32 block_size; + struct loop_info64 info; + __u64 __reserved[8]; +}; + /* * Loop filter types */ @@ -96,6 +116,7 @@ struct loop_info64 { #define LOOP_SET_CAPACITY 0x4C07 #define LOOP_SET_DIRECT_IO 0x4C08 #define LOOP_SET_BLOCK_SIZE 0x4C09 +#define LOOP_CONFIGURE 0x4C0A /* /dev/loop-control interface */ #define LOOP_CTL_ADD 0x4C80 -- cgit v1.2.3 From fe810b509c5f62b5b3d5681ea6f5d36349ced979 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 11 May 2020 12:05:41 -0400 Subject: IB/hfi1: Add accelerated IP capability bit The accelerated IP capability bit is added to allow users to control which feature is enabled and disabled. Link: https://lore.kernel.org/r/20200511160541.173205.96870.stgit@awfm-01.aw.intel.com Reviewed-by: Dennis Dalessandro Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/common.h | 5 +++-- include/uapi/rdma/hfi/hfi1_user.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index 40a1ff0c8a8e..1f7107e35a43 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -149,7 +149,8 @@ HFI1_CAP_NO_INTEGRITY | \ HFI1_CAP_PKEY_CHECK | \ HFI1_CAP_TID_RDMA | \ - HFI1_CAP_OPFN) << \ + HFI1_CAP_OPFN | \ + HFI1_CAP_AIP) << \ HFI1_CAP_USER_SHIFT) /* * Set of capabilities that need to be enabled for kernel context in diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 01ac5853d9ac..d95ef9a2b032 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -6,7 +6,7 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -109,6 +109,7 @@ #define HFI1_CAP_OPFN (1UL << 16) /* Enable the OPFN protocol */ #define HFI1_CAP_SDMA_HEAD_CHECK (1UL << 17) /* SDMA head checking */ #define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */ +#define HFI1_CAP_AIP (1UL << 19) /* Enable accelerated IP */ #define HFI1_RCVHDR_ENTSIZE_2 (1UL << 0) #define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1) -- cgit v1.2.3 From cda9ee494248b890973f5d31cf7851c0d21755b9 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:07 +0300 Subject: IB/uverbs: Extend CQ to get its own asynchronous event FD Extend CQ to get its own asynchronous event FD. The event FD is an optional attribute, in case wasn't given the ufile event FD will be used. Link: https://lore.kernel.org/r/20200519072711.257271-4-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 18 ++++++++++++++++++ drivers/infiniband/core/uverbs_std_types_cq.c | 9 ++++++--- include/uapi/rdma/ib_user_ioctl_cmds.h | 1 + 3 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 34c155f88317..53a10479958b 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -297,6 +297,24 @@ static inline u32 make_port_cap_flags(const struct ib_port_attr *attr) return res; } +static inline struct ib_uverbs_async_event_file * +ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs, + u16 id) +{ + struct ib_uobject *async_ev_file_uobj; + struct ib_uverbs_async_event_file *async_ev_file; + + async_ev_file_uobj = uverbs_attr_get_uobject(attrs, id); + if (IS_ERR(async_ev_file_uobj)) + async_ev_file = READ_ONCE(attrs->ufile->default_async_file); + else + async_ev_file = container_of(async_ev_file_uobj, + struct ib_uverbs_async_event_file, + uobj); + if (async_ev_file) + uverbs_uobject_get(&async_ev_file->uobj); + return async_ev_file; +} void copy_port_attr_to_resp(struct ib_port_attr *attr, struct ib_uverbs_query_port_resp *resp, diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index be534b0af4f8..5dce2c7cc323 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -100,9 +100,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( uverbs_uobject_get(ev_file_uobj); } - obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); - if (obj->uevent.event_file) - uverbs_uobject_get(&obj->uevent.event_file->uobj); + obj->uevent.event_file = ib_uverbs_get_async_event( + attrs, UVERBS_ATTR_CREATE_CQ_EVENT_FD); if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) { ret = -EINVAL; @@ -173,6 +172,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), UVERBS_ATTR_UHW()); static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index d4ddbe4e696c..286fdc1929e0 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -95,6 +95,7 @@ enum uverbs_attrs_create_cq_cmd_attr_ids { UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_CREATE_CQ_RESP_CQE, + UVERBS_ATTR_CREATE_CQ_EVENT_FD, }; enum uverbs_attrs_destroy_cq_cmd_attr_ids { -- cgit v1.2.3 From 175ba58d62c84e1216cdf8b4f49f79e55e1ed04b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:08 +0300 Subject: IB/uverbs: Move QP, SRQ, WQ type and flags to UAPI These constants are going to be used in the ioctl interface in coming patches so they are part of the UAPI, place them in the correct header for clarity. Link: https://lore.kernel.org/r/20200519072711.257271-5-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 43 ++++++++++++++++++--------------- include/uapi/rdma/ib_user_ioctl_verbs.h | 34 ++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 19 deletions(-) (limited to 'include/uapi') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c988e9205cf9..94533ae16697 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1040,9 +1040,9 @@ enum ib_cq_notify_flags { }; enum ib_srq_type { - IB_SRQT_BASIC, - IB_SRQT_XRC, - IB_SRQT_TM, + IB_SRQT_BASIC = IB_UVERBS_SRQT_BASIC, + IB_SRQT_XRC = IB_UVERBS_SRQT_XRC, + IB_SRQT_TM = IB_UVERBS_SRQT_TM, }; static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) @@ -1111,16 +1111,16 @@ enum ib_qp_type { IB_QPT_SMI, IB_QPT_GSI, - IB_QPT_RC, - IB_QPT_UC, - IB_QPT_UD, + IB_QPT_RC = IB_UVERBS_QPT_RC, + IB_QPT_UC = IB_UVERBS_QPT_UC, + IB_QPT_UD = IB_UVERBS_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, - IB_QPT_RAW_PACKET = 8, - IB_QPT_XRC_INI = 9, - IB_QPT_XRC_TGT, + IB_QPT_RAW_PACKET = IB_UVERBS_QPT_RAW_PACKET, + IB_QPT_XRC_INI = IB_UVERBS_QPT_XRC_INI, + IB_QPT_XRC_TGT = IB_UVERBS_QPT_XRC_TGT, IB_QPT_MAX, - IB_QPT_DRIVER = 0xFF, + IB_QPT_DRIVER = IB_UVERBS_QPT_DRIVER, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer @@ -1139,17 +1139,21 @@ enum ib_qp_type { enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, - IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_INTEGRITY_EN = 1 << 6, IB_QP_CREATE_NETDEV_USE = 1 << 7, - IB_QP_CREATE_SCATTER_FCS = 1 << 8, - IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_QP_CREATE_SCATTER_FCS = + IB_UVERBS_QP_CREATE_SCATTER_FCS, + IB_QP_CREATE_CVLAN_STRIPPING = + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING, IB_QP_CREATE_SOURCE_QPN = 1 << 10, - IB_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, + IB_QP_CREATE_PCI_WRITE_END_PADDING = + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, @@ -1654,7 +1658,7 @@ enum ib_raw_packet_caps { }; enum ib_wq_type { - IB_WQT_RQ + IB_WQT_RQ = IB_UVERBS_WQT_RQ, }; enum ib_wq_state { @@ -1677,10 +1681,11 @@ struct ib_wq { }; enum ib_wq_flags { - IB_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, - IB_WQ_FLAGS_SCATTER_FCS = 1 << 1, - IB_WQ_FLAGS_DELAY_DROP = 1 << 2, - IB_WQ_FLAGS_PCI_WRITE_END_PADDING = 1 << 3, + IB_WQ_FLAGS_CVLAN_STRIPPING = IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING, + IB_WQ_FLAGS_SCATTER_FCS = IB_UVERBS_WQ_FLAGS_SCATTER_FCS, + IB_WQ_FLAGS_DELAY_DROP = IB_UVERBS_WQ_FLAGS_DELAY_DROP, + IB_WQ_FLAGS_PCI_WRITE_END_PADDING = + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING, }; struct ib_wq_init_attr { diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index a640bb814be0..b1662dfe86a6 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -64,6 +64,40 @@ enum ib_uverbs_access_flags { ~(IB_UVERBS_ACCESS_OPTIONAL_FIRST - 1) }; +enum ib_uverbs_srq_type { + IB_UVERBS_SRQT_BASIC, + IB_UVERBS_SRQT_XRC, + IB_UVERBS_SRQT_TM, +}; + +enum ib_uverbs_wq_type { + IB_UVERBS_WQT_RQ, +}; + +enum ib_uverbs_wq_flags { + IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, + IB_UVERBS_WQ_FLAGS_SCATTER_FCS = 1 << 1, + IB_UVERBS_WQ_FLAGS_DELAY_DROP = 1 << 2, + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING = 1 << 3, +}; + +enum ib_uverbs_qp_type { + IB_UVERBS_QPT_RC = 2, + IB_UVERBS_QPT_UC, + IB_UVERBS_QPT_UD, + IB_UVERBS_QPT_RAW_PACKET = 8, + IB_UVERBS_QPT_XRC_INI, + IB_UVERBS_QPT_XRC_TGT, + IB_UVERBS_QPT_DRIVER = 0xFF, +}; + +enum ib_uverbs_qp_create_flags { + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_UVERBS_QP_CREATE_SCATTER_FCS = 1 << 8, + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, +}; + enum ib_uverbs_query_port_cap_flags { IB_UVERBS_PCF_SM = 1 << 1, IB_UVERBS_PCF_NOTICE_SUP = 1 << 2, -- cgit v1.2.3 From c3eab946aba443f0b44a08f446735c74495610a9 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:09 +0300 Subject: IB/uverbs: Introduce create/destroy SRQ commands over ioctl Introduce create/destroy SRQ commands over the ioctl interface to let it be extended to get an asynchronous event FD. Link: https://lore.kernel.org/r/20200519072711.257271-6-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs_std_types.c | 32 ---- drivers/infiniband/core/uverbs_std_types_srq.c | 234 +++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 1 + include/uapi/rdma/ib_user_ioctl_cmds.h | 27 +++ 6 files changed, 265 insertions(+), 33 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_srq.c (limited to 'include/uapi') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 870f0fcd54d5..d7b46a7c07fd 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -36,6 +36,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o \ - uverbs_std_types_async_fd.o + uverbs_std_types_async_fd.o \ + uverbs_std_types_srq.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 2b529233e159..d623f911b70b 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -159,6 +159,7 @@ extern const struct uapi_definition uverbs_def_obj_dm[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; +extern const struct uapi_definition uverbs_def_obj_srq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 3abfc63225cb..d9b6912eafa8 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -142,31 +142,6 @@ static int uverbs_free_wq(struct ib_uobject *uobject, return ret; } -static int uverbs_free_srq(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_srq *srq = uobject->object; - struct ib_uevent_object *uevent = - container_of(uobject, struct ib_uevent_object, uobject); - enum ib_srq_type srq_type = srq->srq_type; - int ret; - - ret = ib_destroy_srq_user(srq, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - if (srq_type == IB_SRQT_XRC) { - struct ib_usrq_object *us = - container_of(uevent, struct ib_usrq_object, uevent); - - atomic_dec(&us->uxrcd->refcnt); - } - - ib_uverbs_release_uevent(uevent); - return ret; -} - static int uverbs_free_xrcd(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -267,11 +242,6 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw), &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_SRQ, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), - uverbs_free_srq)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_AH_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE, @@ -346,8 +316,6 @@ const struct uapi_definition uverbs_def_obj_intf[] = { UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW, UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, - UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW, UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, diff --git a/drivers/infiniband/core/uverbs_std_types_srq.c b/drivers/infiniband/core/uverbs_std_types_srq.c new file mode 100644 index 000000000000..c0ecbba26bf4 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_srq.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_srq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_srq *srq = uobject->object; + struct ib_uevent_object *uevent = + container_of(uobject, struct ib_uevent_object, uobject); + enum ib_srq_type srq_type = srq->srq_type; + int ret; + + ret = ib_destroy_srq_user(srq, &attrs->driver_udata); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + if (srq_type == IB_SRQT_XRC) { + struct ib_usrq_object *us = + container_of(uobject, struct ib_usrq_object, + uevent.uobject); + + atomic_dec(&us->uxrcd->refcnt); + } + + ib_uverbs_release_uevent(uevent); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_usrq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE); + struct ib_srq_init_attr attr = {}; + struct ib_uobject *xrcd_uobj; + struct ib_srq *srq; + u64 user_handle; + int ret; + + ret = uverbs_copy_from(&attr.attr.max_sge, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&attr.attr.max_wr, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&attr.attr.srq_limit, attrs, + UVERBS_ATTR_CREATE_SRQ_LIMIT); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_SRQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.srq_type, attrs, + UVERBS_ATTR_CREATE_SRQ_TYPE); + if (ret) + return ret; + + if (ib_srq_has_cq(attr.srq_type)) { + attr.ext.cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE); + if (IS_ERR(attr.ext.cq)) + return PTR_ERR(attr.ext.cq); + } + + switch (attr.srq_type) { + case IB_UVERBS_SRQT_XRC: + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!attr.ext.xrc.xrcd) + return -EINVAL; + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + break; + case IB_UVERBS_SRQT_TM: + ret = uverbs_copy_from(&attr.ext.tag_matching.max_num_tags, + attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS); + if (ret) + return ret; + break; + case IB_UVERBS_SRQT_BASIC: + break; + default: + return -EINVAL; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_SRQ_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + attr.event_handler = ib_uverbs_srq_event_handler; + obj->uevent.uobject.user_handle = user_handle; + + srq = ib_create_srq_user(pd, &attr, obj, &attrs->driver_udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err; + } + + obj->uevent.uobject.object = srq; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + &attr.attr.max_wr, + sizeof(attr.attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + &attr.attr.max_sge, + sizeof(attr.attr.max_sge)); + if (ret) + return ret; + + if (attr.srq_type == IB_SRQT_XRC) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + &srq->ext.xrc.srq_num, + sizeof(srq->ext.xrc.srq_num)); + if (ret) + return ret; + } + + return 0; +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + if (attr.srq_type == IB_SRQT_XRC) + atomic_dec(&obj->uxrcd->refcnt); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_SRQ_TYPE, + enum ib_uverbs_srq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_LIMIT, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_SRQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_SRQ_HANDLE); + struct ib_usrq_object *obj = + container_of(uobj, struct ib_usrq_object, uevent.uobject); + struct ib_uverbs_destroy_srq_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_SRQ_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_SRQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_srq_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_SRQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), + uverbs_free_srq), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_srq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 3f121ac31e0a..3f5627954fe7 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -634,6 +634,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), + UAPI_DEF_CHAIN(uverbs_def_obj_srq), UAPI_DEF_CHAIN(uverbs_def_write_intf), {}, }; diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 286fdc1929e0..c07af46ff04c 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -121,6 +121,33 @@ enum uverbs_attrs_destroy_flow_action_esp { UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, }; +enum uverbs_attrs_create_srq_cmd_attr_ids { + UVERBS_ATTR_CREATE_SRQ_HANDLE, + UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, + UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, + UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, + UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, + UVERBS_ATTR_CREATE_SRQ_MAX_WR, + UVERBS_ATTR_CREATE_SRQ_MAX_SGE, + UVERBS_ATTR_CREATE_SRQ_LIMIT, + UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, + UVERBS_ATTR_CREATE_SRQ_TYPE, + UVERBS_ATTR_CREATE_SRQ_EVENT_FD, + UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, +}; + +enum uverbs_attrs_destroy_srq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_SRQ_HANDLE, + UVERBS_ATTR_DESTROY_SRQ_RESP, +}; + +enum uverbs_methods_srq { + UVERBS_METHOD_SRQ_CREATE, + UVERBS_METHOD_SRQ_DESTROY, +}; + enum uverbs_methods_cq { UVERBS_METHOD_CQ_CREATE, UVERBS_METHOD_CQ_DESTROY, -- cgit v1.2.3 From ef3bc084a8ed461e3d1f82481f47dacb96596f8f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:10 +0300 Subject: IB/uverbs: Introduce create/destroy WQ commands over ioctl Introduce create/destroy WQ commands over the ioctl interface to let it be extended to get an asynchronous event FD. Link: https://lore.kernel.org/r/20200519072711.257271-7-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs_std_types.c | 23 --- drivers/infiniband/core/uverbs_std_types_wq.c | 194 ++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 1 + include/uapi/rdma/ib_user_ioctl_cmds.h | 25 ++++ 6 files changed, 223 insertions(+), 24 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_wq.c (limited to 'include/uapi') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d7b46a7c07fd..96c0a4b5af18 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -37,6 +37,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o \ uverbs_std_types_async_fd.o \ - uverbs_std_types_srq.o + uverbs_std_types_srq.o \ + uverbs_std_types_wq.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index d623f911b70b..9e9f2fa04fb9 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -160,6 +160,7 @@ extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; extern const struct uapi_definition uverbs_def_obj_srq[]; +extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index d9b6912eafa8..c328d5194076 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -125,23 +125,6 @@ static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, return ret; } -static int uverbs_free_wq(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_wq *wq = uobject->object; - struct ib_uwq_object *uwq = - container_of(uobject, struct ib_uwq_object, uevent.uobject); - int ret; - - ret = ib_destroy_wq(wq, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - ib_uverbs_release_uevent(&uwq->uevent); - return ret; -} - static int uverbs_free_xrcd(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -266,10 +249,6 @@ DECLARE_UVERBS_NAMED_OBJECT( uverbs_free_flow), &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_WQ, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_RWQ_IND_TBL_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE, @@ -318,8 +297,6 @@ const struct uapi_definition uverbs_def_obj_intf[] = { UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW, UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, - UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED( UVERBS_OBJECT_RWQ_IND_TBL, UAPI_DEF_OBJ_NEEDS_FN(destroy_rwq_ind_table)), diff --git a/drivers/infiniband/core/uverbs_std_types_wq.c b/drivers/infiniband/core/uverbs_std_types_wq.c new file mode 100644 index 000000000000..cad842ede077 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_wq.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_wq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_wq *wq = uobject->object; + struct ib_uwq_object *uwq = + container_of(uobject, struct ib_uwq_object, uevent.uobject); + int ret; + + ret = ib_destroy_wq(wq, &attrs->driver_udata); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + ib_uverbs_release_uevent(&uwq->uevent); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uwq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_PD_HANDLE); + struct ib_cq *cq = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_CQ_HANDLE); + struct ib_wq_init_attr wq_init_attr = {}; + struct ib_wq *wq; + u64 user_handle; + int ret; + + ret = uverbs_get_flags32(&wq_init_attr.create_flags, attrs, + UVERBS_ATTR_CREATE_WQ_FLAGS, + IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING | + IB_UVERBS_WQ_FLAGS_SCATTER_FCS | + IB_UVERBS_WQ_FLAGS_DELAY_DROP | + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_sge, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_wr, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_WQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&wq_init_attr.wq_type, attrs, + UVERBS_ATTR_CREATE_WQ_TYPE); + if (ret) + return ret; + + if (wq_init_attr.wq_type != IB_WQT_RQ) + return -EINVAL; + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_WQ_EVENT_FD); + obj->uevent.uobject.user_handle = user_handle; + INIT_LIST_HEAD(&obj->uevent.event_list); + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + wq_init_attr.wq_context = attrs->ufile; + wq_init_attr.cq = cq; + + wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); + if (IS_ERR(wq)) { + ret = PTR_ERR(wq); + goto err; + } + + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = obj; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + &wq_init_attr.max_wr, + sizeof(wq_init_attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + &wq_init_attr.max_sge, + sizeof(wq_init_attr.max_sge)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + &wq->wq_num, + sizeof(wq->wq_num)); + return ret; + +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_HANDLE, + UVERBS_OBJECT_WQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_WQ_TYPE, + enum ib_wq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_WQ_FLAGS, + enum ib_uverbs_wq_flags, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_WQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_WQ_HANDLE); + struct ib_uwq_object *obj = + container_of(uobj, struct ib_uwq_object, uevent.uobject); + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_WQ_RESP, + &obj->uevent.events_reported, + sizeof(obj->uevent.events_reported)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_WQ_HANDLE, + UVERBS_OBJECT_WQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_WQ_RESP, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_WQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq), + &UVERBS_METHOD(UVERBS_METHOD_WQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_WQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_wq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 3f5627954fe7..0ec8cf86ecfa 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -635,6 +635,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), UAPI_DEF_CHAIN(uverbs_def_obj_srq), + UAPI_DEF_CHAIN(uverbs_def_obj_wq), UAPI_DEF_CHAIN(uverbs_def_write_intf), {}, }; diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index c07af46ff04c..381b17889d20 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -153,6 +153,31 @@ enum uverbs_methods_cq { UVERBS_METHOD_CQ_DESTROY, }; +enum uverbs_attrs_create_wq_cmd_attr_ids { + UVERBS_ATTR_CREATE_WQ_HANDLE, + UVERBS_ATTR_CREATE_WQ_PD_HANDLE, + UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, + UVERBS_ATTR_CREATE_WQ_USER_HANDLE, + UVERBS_ATTR_CREATE_WQ_TYPE, + UVERBS_ATTR_CREATE_WQ_EVENT_FD, + UVERBS_ATTR_CREATE_WQ_MAX_WR, + UVERBS_ATTR_CREATE_WQ_MAX_SGE, + UVERBS_ATTR_CREATE_WQ_FLAGS, + UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, +}; + +enum uverbs_attrs_destroy_wq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_WQ_HANDLE, + UVERBS_ATTR_DESTROY_WQ_RESP, +}; + +enum uverbs_methods_wq { + UVERBS_METHOD_WQ_CREATE, + UVERBS_METHOD_WQ_DESTROY, +}; + enum uverbs_methods_actions_flow_action_ops { UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, UVERBS_METHOD_FLOW_ACTION_DESTROY, -- cgit v1.2.3 From 6d1e7ba241e990b5c6ba7fdaa03d466f852f3c9e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:11 +0300 Subject: IB/uverbs: Introduce create/destroy QP commands over ioctl Introduce create/destroy QP commands over the ioctl interface to let it be extended to get an asynchronous event FD. Link: https://lore.kernel.org/r/20200519072711.257271-8-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs_std_types.c | 40 --- drivers/infiniband/core/uverbs_std_types_qp.c | 401 ++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 1 + include/uapi/rdma/ib_user_ioctl_cmds.h | 28 ++ include/uapi/rdma/ib_user_ioctl_verbs.h | 9 + 7 files changed, 442 insertions(+), 41 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_qp.c (limited to 'include/uapi') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 96c0a4b5af18..63c1591223ac 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -38,6 +38,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_uapi.o uverbs_std_types_device.o \ uverbs_std_types_async_fd.o \ uverbs_std_types_srq.o \ - uverbs_std_types_wq.o + uverbs_std_types_wq.o \ + uverbs_std_types_qp.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 9e9f2fa04fb9..33706dad6c0f 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -159,6 +159,7 @@ extern const struct uapi_definition uverbs_def_obj_dm[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; +extern const struct uapi_definition uverbs_def_obj_qp[]; extern const struct uapi_definition uverbs_def_obj_srq[]; extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index c328d5194076..08c39cfb1bd9 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -75,40 +75,6 @@ static int uverbs_free_mw(struct ib_uobject *uobject, return uverbs_dealloc_mw((struct ib_mw *)uobject->object); } -static int uverbs_free_qp(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_qp *qp = uobject->object; - struct ib_uqp_object *uqp = - container_of(uobject, struct ib_uqp_object, uevent.uobject); - int ret; - - /* - * If this is a user triggered destroy then do not allow destruction - * until the user cleans up all the mcast bindings. Unlike in other - * places we forcibly clean up the mcast attachments for !DESTROY - * because the mcast attaches are not ubojects and will not be - * destroyed by anything else during cleanup processing. - */ - if (why == RDMA_REMOVE_DESTROY) { - if (!list_empty(&uqp->mcast_list)) - return -EBUSY; - } else if (qp == qp->real_qp) { - ib_uverbs_detach_umcast(qp, uqp); - } - - ret = ib_destroy_qp_user(qp, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - if (uqp->uxrcd) - atomic_dec(&uqp->uxrcd->refcnt); - - ib_uverbs_release_uevent(&uqp->uevent); - return ret; -} - static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -210,10 +176,6 @@ DECLARE_UVERBS_NAMED_OBJECT( "[infinibandevent]", O_RDONLY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_QP, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MW_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE, @@ -289,8 +251,6 @@ const struct uapi_definition uverbs_def_obj_intf[] = { UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL, UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, - UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_AH, UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW, diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c new file mode 100644 index 000000000000..3bf8dcdfe7eb --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" +#include "core_priv.h" + +static int uverbs_free_qp(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_qp *qp = uobject->object; + struct ib_uqp_object *uqp = + container_of(uobject, struct ib_uqp_object, uevent.uobject); + int ret; + + /* + * If this is a user triggered destroy then do not allow destruction + * until the user cleans up all the mcast bindings. Unlike in other + * places we forcibly clean up the mcast attachments for !DESTROY + * because the mcast attaches are not ubojects and will not be + * destroyed by anything else during cleanup processing. + */ + if (why == RDMA_REMOVE_DESTROY) { + if (!list_empty(&uqp->mcast_list)) + return -EBUSY; + } else if (qp == qp->real_qp) { + ib_uverbs_detach_umcast(qp, uqp); + } + + ret = ib_destroy_qp_user(qp, &attrs->driver_udata); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + if (uqp->uxrcd) + atomic_dec(&uqp->uxrcd->refcnt); + + ib_uverbs_release_uevent(&uqp->uevent); + return ret; +} + +static int check_creation_flags(enum ib_qp_type qp_type, + u32 create_flags) +{ + create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + + if (!create_flags || qp_type == IB_QPT_DRIVER) + return 0; + + if (qp_type != IB_QPT_RAW_PACKET && qp_type != IB_QPT_UD) + return -EINVAL; + + if ((create_flags & IB_UVERBS_QP_CREATE_SCATTER_FCS || + create_flags & IB_UVERBS_QP_CREATE_CVLAN_STRIPPING) && + qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; + + return 0; +} + +static void set_caps(struct ib_qp_init_attr *attr, + struct ib_uverbs_qp_cap *cap, bool req) +{ + if (req) { + attr->cap.max_send_wr = cap->max_send_wr; + attr->cap.max_recv_wr = cap->max_recv_wr; + attr->cap.max_send_sge = cap->max_send_sge; + attr->cap.max_recv_sge = cap->max_recv_sge; + attr->cap.max_inline_data = cap->max_inline_data; + } else { + cap->max_send_wr = attr->cap.max_send_wr; + cap->max_recv_wr = attr->cap.max_recv_wr; + cap->max_send_sge = attr->cap.max_send_sge; + cap->max_recv_sge = attr->cap.max_recv_sge; + cap->max_inline_data = attr->cap.max_inline_data; + } +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uqp_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_QP_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_qp_cap cap = {}; + struct ib_rwq_ind_table *rwq_ind_tbl = NULL; + struct ib_qp *qp; + struct ib_pd *pd = NULL; + struct ib_srq *srq = NULL; + struct ib_cq *recv_cq = NULL; + struct ib_cq *send_cq = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *xrcd_uobj = NULL; + struct ib_device *device; + u64 user_handle; + int ret; + + ret = uverbs_copy_from_or_zero(&cap, attrs, + UVERBS_ATTR_CREATE_QP_CAP); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_QP_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.qp_type, attrs, + UVERBS_ATTR_CREATE_QP_TYPE); + if (ret) + return ret; + + switch (attr.qp_type) { + case IB_QPT_XRC_TGT: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE)) + return -EINVAL; + + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) + return -EINVAL; + device = xrcd->device; + break; + case IB_UVERBS_QPT_RAW_PACKET: + if (!capable(CAP_NET_RAW)) + return -EPERM; + fallthrough; + case IB_UVERBS_QPT_RC: + case IB_UVERBS_QPT_UC: + case IB_UVERBS_QPT_UD: + case IB_UVERBS_QPT_XRC_INI: + case IB_UVERBS_QPT_DRIVER: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE) || + (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE) && + attr.qp_type == IB_QPT_XRC_INI)) + return -EINVAL; + + pd = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + rwq_ind_tbl = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE); + if (!IS_ERR(rwq_ind_tbl)) { + if (cap.max_recv_wr || cap.max_recv_sge || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE)) + return -EINVAL; + + /* send_cq is optinal */ + if (cap.max_send_wr) { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + } + attr.rwq_ind_tbl = rwq_ind_tbl; + } else { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + + if (attr.qp_type != IB_QPT_XRC_INI) { + recv_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE); + if (IS_ERR(recv_cq)) + return PTR_ERR(recv_cq); + } + } + + device = pd->device; + break; + default: + return -EINVAL; + } + + ret = uverbs_get_flags32(&attr.create_flags, attrs, + UVERBS_ATTR_CREATE_QP_FLAGS, + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_UVERBS_QP_CREATE_SCATTER_FCS | + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING | + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING | + IB_UVERBS_QP_CREATE_SQ_SIG_ALL); + if (ret) + return ret; + + ret = check_creation_flags(attr.qp_type, attr.create_flags); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN)) { + ret = uverbs_copy_from(&attr.source_qpn, attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN); + if (ret) + return ret; + attr.create_flags |= IB_QP_CREATE_SOURCE_QPN; + } + + srq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE); + if (!IS_ERR(srq)) { + if ((srq->srq_type == IB_SRQT_XRC && + attr.qp_type != IB_QPT_XRC_TGT) || + (srq->srq_type != IB_SRQT_XRC && + attr.qp_type == IB_QPT_XRC_TGT)) + return -EINVAL; + attr.srq = srq; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_QP_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + obj->uevent.uobject.user_handle = user_handle; + attr.event_handler = ib_uverbs_qp_event_handler; + attr.send_cq = send_cq; + attr.recv_cq = recv_cq; + attr.xrcd = xrcd; + if (attr.create_flags & IB_UVERBS_QP_CREATE_SQ_SIG_ALL) { + /* This creation bit is uverbs one, need to mask before + * calling drivers. It was added to prevent an extra user attr + * only for that when using ioctl. + */ + attr.create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + attr.sq_sig_type = IB_SIGNAL_ALL_WR; + } else { + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + } + + set_caps(&attr, &cap, true); + mutex_init(&obj->mcast_lock); + + if (attr.qp_type == IB_QPT_XRC_TGT) + qp = ib_create_qp(pd, &attr); + else + qp = _ib_create_qp(device, pd, &attr, &attrs->driver_udata, + obj); + + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + + if (attr.qp_type != IB_QPT_XRC_TGT) { + atomic_inc(&pd->usecnt); + if (attr.send_cq) + atomic_inc(&attr.send_cq->usecnt); + if (attr.recv_cq) + atomic_inc(&attr.recv_cq->usecnt); + if (attr.srq) + atomic_inc(&attr.srq->usecnt); + if (attr.rwq_ind_tbl) + atomic_inc(&attr.rwq_ind_tbl->usecnt); + } else { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + /* It is done in _ib_create_qp for other QP types */ + qp->uobject = obj; + } + + obj->uevent.uobject.object = qp; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_QP_HANDLE); + + if (attr.qp_type != IB_QPT_XRC_TGT) { + ret = ib_create_qp_security(qp, device); + if (ret) + return ret; + } + + set_caps(&attr, &cap, false); + ret = uverbs_copy_to_struct_or_zero(attrs, + UVERBS_ATTR_CREATE_QP_RESP_CAP, &cap, + sizeof(cap)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + &qp->qp_num, + sizeof(qp->qp_num)); + + return ret; +err_put: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_QP_TYPE, + enum ib_uverbs_qp_type, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_QP_FLAGS, + enum ib_uverbs_qp_create_flags, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_QP_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_QP_HANDLE); + struct ib_uqp_object *obj = + container_of(uobj, struct ib_uqp_object, uevent.uobject); + struct ib_uverbs_destroy_qp_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_QP_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_QP_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_qp_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_QP, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp), + &UVERBS_METHOD(UVERBS_METHOD_QP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_QP_DESTROY)); + +const struct uapi_definition uverbs_def_obj_qp[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, + UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 0ec8cf86ecfa..5addc8fae3f3 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -634,6 +634,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), + UAPI_DEF_CHAIN(uverbs_def_obj_qp), UAPI_DEF_CHAIN(uverbs_def_obj_srq), UAPI_DEF_CHAIN(uverbs_def_obj_wq), UAPI_DEF_CHAIN(uverbs_def_write_intf), diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 381b17889d20..4961d5e858eb 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -121,6 +121,34 @@ enum uverbs_attrs_destroy_flow_action_esp { UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, }; +enum uverbs_attrs_create_qp_cmd_attr_ids { + UVERBS_ATTR_CREATE_QP_HANDLE, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, + UVERBS_ATTR_CREATE_QP_PD_HANDLE, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + UVERBS_ATTR_CREATE_QP_USER_HANDLE, + UVERBS_ATTR_CREATE_QP_CAP, + UVERBS_ATTR_CREATE_QP_TYPE, + UVERBS_ATTR_CREATE_QP_FLAGS, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + UVERBS_ATTR_CREATE_QP_EVENT_FD, + UVERBS_ATTR_CREATE_QP_RESP_CAP, + UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, +}; + +enum uverbs_attrs_destroy_qp_cmd_attr_ids { + UVERBS_ATTR_DESTROY_QP_HANDLE, + UVERBS_ATTR_DESTROY_QP_RESP, +}; + +enum uverbs_methods_qp { + UVERBS_METHOD_QP_CREATE, + UVERBS_METHOD_QP_DESTROY, +}; + enum uverbs_attrs_create_srq_cmd_attr_ids { UVERBS_ATTR_CREATE_SRQ_HANDLE, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index b1662dfe86a6..5debab45ebcb 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -96,6 +96,7 @@ enum ib_uverbs_qp_create_flags { IB_UVERBS_QP_CREATE_SCATTER_FCS = 1 << 8, IB_UVERBS_QP_CREATE_CVLAN_STRIPPING = 1 << 9, IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, + IB_UVERBS_QP_CREATE_SQ_SIG_ALL = 1 << 12, }; enum ib_uverbs_query_port_cap_flags { @@ -219,6 +220,14 @@ struct ib_uverbs_query_port_resp_ex { __u8 reserved[6]; }; +struct ib_uverbs_qp_cap { + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; +}; + enum rdma_driver_id { RDMA_DRIVER_UNKNOWN, RDMA_DRIVER_MLX5, -- cgit v1.2.3 From d8bed686ab96169ac80b497d1cbed89300d97f83 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 19 May 2020 22:45:20 +0800 Subject: net: psample: Add tunnel support Currently, psample can only send the packet bits after decapsulation. The tunnel information is lost. Add the tunnel support. If the sampled packet has no tunnel info, the behavior is the same as before. If it has, add a nested metadata field named PSAMPLE_ATTR_TUNNEL and include the tunnel subfields if applicable. Increase the metadata length for sampled packet with the tunnel info. If new subfields of tunnel info should be included, update the metadata length accordingly. Signed-off-by: Chris Mi Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/psample.h | 22 ++++++ net/psample/psample.c | 157 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h index ce1116cff53d..aea26ab1431c 100644 --- a/include/uapi/linux/psample.h +++ b/include/uapi/linux/psample.h @@ -11,6 +11,7 @@ enum { PSAMPLE_ATTR_GROUP_SEQ, PSAMPLE_ATTR_SAMPLE_RATE, PSAMPLE_ATTR_DATA, + PSAMPLE_ATTR_TUNNEL, /* commands attributes */ PSAMPLE_ATTR_GROUP_REFCOUNT, @@ -25,6 +26,27 @@ enum psample_command { PSAMPLE_CMD_DEL_GROUP, }; +enum psample_tunnel_key_attr { + PSAMPLE_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */ + PSAMPLE_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ + PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ + PSAMPLE_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + PSAMPLE_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, /* be16 src Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, /* be16 dst Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_VXLAN_OPTS, /* Nested VXLAN opts* */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_PAD, + PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE, /* No argument. IPV4_INFO_BRIDGE mode.*/ + __PSAMPLE_TUNNEL_KEY_ATTR_MAX +}; + /* Can be overridden at runtime by module option */ #define PSAMPLE_ATTR_MAX (__PSAMPLE_ATTR_MAX - 1) diff --git a/net/psample/psample.c b/net/psample/psample.c index 6f2fbc6b9eb2..34a74043840b 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #define PSAMPLE_MAX_PACKET_SIZE 0xffff @@ -207,10 +209,155 @@ void psample_group_put(struct psample_group *group) } EXPORT_SYMBOL_GPL(psample_group_put); +static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + unsigned short tun_proto = ip_tunnel_info_af(tun_info); + const void *tun_opts = ip_tunnel_info_opts(tun_info); + const struct ip_tunnel_key *tun_key = &tun_info->key; + int tun_opts_len = tun_info->options_len; + + if (tun_key->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, + PSAMPLE_TUNNEL_KEY_ATTR_PAD)) + return -EMSGSIZE; + + if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE)) + return -EMSGSIZE; + + switch (tun_proto) { + case AF_INET: + if (tun_key->u.ipv4.src && + nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, + tun_key->u.ipv4.src)) + return -EMSGSIZE; + if (tun_key->u.ipv4.dst && + nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, + tun_key->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&tun_key->u.ipv6.src) && + nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, + &tun_key->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&tun_key->u.ipv6.dst) && + nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, + &tun_key->u.ipv6.dst)) + return -EMSGSIZE; + break; + } + if (tun_key->tos && + nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TOS, tun_key->tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if (tun_key->tp_src && + nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, tun_key->tp_src)) + return -EMSGSIZE; + if (tun_key->tp_dst && + nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) + return -EMSGSIZE; + if (tun_opts_len) { + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT && + nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, + tun_opts_len, tun_opts)) + return -EMSGSIZE; + else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + tun_opts_len, tun_opts)) + return -EMSGSIZE; + } + + return 0; +} + +static int psample_ip_tun_to_nlattr(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start_noflag(skb, PSAMPLE_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __psample_ip_tun_to_nlattr(skb, tun_info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } + + nla_nest_end(skb, nla); + + return 0; +} + +static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) +{ + unsigned short tun_proto = ip_tunnel_info_af(tun_info); + const struct ip_tunnel_key *tun_key = &tun_info->key; + int tun_opts_len = tun_info->options_len; + int sum = 0; + + if (tun_key->tun_flags & TUNNEL_KEY) + sum += nla_total_size(sizeof(u64)); + + if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) + sum += nla_total_size(0); + + switch (tun_proto) { + case AF_INET: + if (tun_key->u.ipv4.src) + sum += nla_total_size(sizeof(u32)); + if (tun_key->u.ipv4.dst) + sum += nla_total_size(sizeof(u32)); + break; + case AF_INET6: + if (!ipv6_addr_any(&tun_key->u.ipv6.src)) + sum += nla_total_size(sizeof(struct in6_addr)); + if (!ipv6_addr_any(&tun_key->u.ipv6.dst)) + sum += nla_total_size(sizeof(struct in6_addr)); + break; + } + if (tun_key->tos) + sum += nla_total_size(sizeof(u8)); + sum += nla_total_size(sizeof(u8)); /* TTL */ + if (tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) + sum += nla_total_size(0); + if (tun_key->tun_flags & TUNNEL_CSUM) + sum += nla_total_size(0); + if (tun_key->tp_src) + sum += nla_total_size(sizeof(u16)); + if (tun_key->tp_dst) + sum += nla_total_size(sizeof(u16)); + if (tun_key->tun_flags & TUNNEL_OAM) + sum += nla_total_size(0); + if (tun_opts_len) { + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) + sum += nla_total_size(tun_opts_len); + else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT) + sum += nla_total_size(tun_opts_len); + } + + return sum; +} + void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 trunc_size, int in_ifindex, int out_ifindex, u32 sample_rate) { + struct ip_tunnel_info *tun_info; struct sk_buff *nl_skb; int data_len; int meta_len; @@ -224,6 +371,10 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)); /* seq */ + tun_info = skb_tunnel_info(skb); + if (tun_info) + meta_len += psample_tunnel_meta_len(tun_info); + data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN @@ -278,6 +429,12 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, goto error; } + if (tun_info) { + ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); + if (unlikely(ret < 0)) + goto error; + } + genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); -- cgit v1.2.3 From 8066021915924f58ed338bf38208215f5a7355f6 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 20 May 2020 08:29:14 +0200 Subject: ethtool: provide UAPI for PHY Signal Quality Index (SQI) Signal Quality Index is a mandatory value required by "OPEN Alliance SIG" for the 100Base-T1 PHYs [1]. This indicator can be used for cable integrity diagnostic and investigating other noise sources and implement by at least two vendors: NXP[2] and TI[3]. [1] http://www.opensig.org/download/document/218/Advanced_PHY_features_for_automotive_Ethernet_V1.0.pdf [2] https://www.nxp.com/docs/en/data-sheet/TJA1100.pdf [3] https://www.ti.com/product/DP83TC811R-Q1 Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 6 ++- include/linux/phy.h | 2 + include/uapi/linux/ethtool_netlink.h | 2 + net/ethtool/linkstate.c | 75 +++++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index eed46b6aa07d..7e651ea33eab 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -454,10 +454,12 @@ Request contents: Kernel response contents: - ==================================== ====== ========================== + ==================================== ====== ============================ ``ETHTOOL_A_LINKSTATE_HEADER`` nested reply header ``ETHTOOL_A_LINKSTATE_LINK`` bool link state (up/down) - ==================================== ====== ========================== + ``ETHTOOL_A_LINKSTATE_SQI`` u32 Current Signal Quality Index + ``ETHTOOL_A_LINKSTATE_SQI_MAX`` u32 Max support SQI value + ==================================== ====== ============================ For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns carrier flag provided by ``netif_carrier_ok()`` but there are drivers which diff --git a/include/linux/phy.h b/include/linux/phy.h index 467aa8bf9f64..2bcdf19ed3b4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -723,6 +723,8 @@ struct phy_driver { struct ethtool_tunable *tuna, const void *data); int (*set_loopback)(struct phy_device *dev, bool enable); + int (*get_sqi)(struct phy_device *dev); + int (*get_sqi_max)(struct phy_device *dev); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 2881af411f76..e6f109b76c9a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -232,6 +232,8 @@ enum { ETHTOOL_A_LINKSTATE_UNSPEC, ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ ETHTOOL_A_LINKSTATE_LINK, /* u8 */ + ETHTOOL_A_LINKSTATE_SQI, /* u32 */ + ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 2740cde0a182..7f47ba89054e 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -2,6 +2,7 @@ #include "netlink.h" #include "common.h" +#include struct linkstate_req_info { struct ethnl_req_info base; @@ -10,6 +11,8 @@ struct linkstate_req_info { struct linkstate_reply_data { struct ethnl_reply_data base; int link; + int sqi; + int sqi_max; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -20,8 +23,46 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED }, [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, }; +static int linkstate_get_sqi(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + +static int linkstate_get_sqi_max(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi_max) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi_max(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -34,6 +75,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; data->link = __ethtool_get_link(dev); + + ret = linkstate_get_sqi(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi = ret; + + ret = linkstate_get_sqi_max(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi_max = ret; + ethnl_ops_complete(dev); return 0; @@ -42,8 +96,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, static int linkstate_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { - return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + int len; + + len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; + + if (data->sqi != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + if (data->sqi_max != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + return len; } static int linkstate_fill_reply(struct sk_buff *skb, @@ -56,6 +121,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; + if (data->sqi != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) + return -EMSGSIZE; + + if (data->sqi_max != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From 82c8c4ddcae74460f4ea484ab9ad97ddb466e469 Mon Sep 17 00:00:00 2001 From: James Jones Date: Wed, 11 Dec 2019 12:55:47 -0800 Subject: drm: Generalized NV Block Linear DRM format mod Builds upon the existing NVIDIA 16Bx2 block linear format modifiers by adding more "fields" to the existing parameterized DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK format modifier macro that allow fully defining a unique-across- all-NVIDIA-hardware bit layout using a minimal set of fields and values. The new modifier macro DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D is effectively backwards compatible with the existing macro, introducing a superset of the previously definable format modifiers. Backwards compatibility has two quirks. First, the zero value for the "kind" field, which is implied by the DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK macro, must be special cased in drivers and assumed to map to the pre-Turing generic kind of 0xfe, since a kind of "zero" is reserved for linear buffer layouts on all GPUs. Second, it is assumed backwards compatibility is only needed when running on Tegra GPUs, and specifically Tegra GPUs prior to Xavier. This is based on two assertions: -Tegra GPUs prior to Xavier used a slightly different raw bit layout than desktop GPUs, making it impossible to directly share block linear buffers between the two. -Support for the existing block linear modifiers was incomplete, making them useful only for exporting buffers created by nouveau and importing them to Tegra DRM as framebuffers for scan out. There was no support for adding framebuffers using format modifiers in nouveau, nor importing dma-buf/PRIME GEM objects into nouveau userspace drivers with modifiers in Mesa. Hence it is assumed the prior modifiers were not intended for use on desktop GPUs, and as a corollary, were not intended to support sharing block linear buffers across two different NVIDIA GPUs. v2: - Added canonicalize helper function v3: - Added additional bit to compression field to support Tesla (NV5x,G8x,G9x,GT1xx,GT2xx) class chips. Signed-off-by: James Jones Signed-off-by: Ben Skeggs --- include/uapi/drm/drm_fourcc.h | 122 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 9e488d10f8b4..490143500a50 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -527,7 +527,113 @@ extern "C" { #define DRM_FORMAT_MOD_NVIDIA_TEGRA_TILED fourcc_mod_code(NVIDIA, 1) /* - * 16Bx2 Block Linear layout, used by desktop GPUs, and Tegra K1 and later + * Generalized Block Linear layout, used by desktop GPUs starting with NV50/G80, + * and Tegra GPUs starting with Tegra K1. + * + * Pixels are arranged in Groups of Bytes (GOBs). GOB size and layout varies + * based on the architecture generation. GOBs themselves are then arranged in + * 3D blocks, with the block dimensions (in terms of GOBs) always being a power + * of two, and hence expressible as their log2 equivalent (E.g., "2" represents + * a block depth or height of "4"). + * + * Chapter 20 "Pixel Memory Formats" of the Tegra X1 TRM describes this format + * in full detail. + * + * Macro + * Bits Param Description + * ---- ----- ----------------------------------------------------------------- + * + * 3:0 h log2(height) of each block, in GOBs. Placed here for + * compatibility with the existing + * DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK()-based modifiers. + * + * 4:4 - Must be 1, to indicate block-linear layout. Necessary for + * compatibility with the existing + * DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK()-based modifiers. + * + * 8:5 - Reserved (To support 3D-surfaces with variable log2(depth) block + * size). Must be zero. + * + * Note there is no log2(width) parameter. Some portions of the + * hardware support a block width of two gobs, but it is impractical + * to use due to lack of support elsewhere, and has no known + * benefits. + * + * 11:9 - Reserved (To support 2D-array textures with variable array stride + * in blocks, specified via log2(tile width in blocks)). Must be + * zero. + * + * 19:12 k Page Kind. This value directly maps to a field in the page + * tables of all GPUs >= NV50. It affects the exact layout of bits + * in memory and can be derived from the tuple + * + * (format, GPU model, compression type, samples per pixel) + * + * Where compression type is defined below. If GPU model were + * implied by the format modifier, format, or memory buffer, page + * kind would not need to be included in the modifier itself, but + * since the modifier should define the layout of the associated + * memory buffer independent from any device or other context, it + * must be included here. + * + * 21:20 g GOB Height and Page Kind Generation. The height of a GOB changed + * starting with Fermi GPUs. Additionally, the mapping between page + * kind and bit layout has changed at various points. + * + * 0 = Gob Height 8, Fermi - Volta, Tegra K1+ Page Kind mapping + * 1 = Gob Height 4, G80 - GT2XX Page Kind mapping + * 2 = Gob Height 8, Turing+ Page Kind mapping + * 3 = Reserved for future use. + * + * 22:22 s Sector layout. On Tegra GPUs prior to Xavier, there is a further + * bit remapping step that occurs at an even lower level than the + * page kind and block linear swizzles. This causes the layout of + * surfaces mapped in those SOC's GPUs to be incompatible with the + * equivalent mapping on other GPUs in the same system. + * + * 0 = Tegra K1 - Tegra Parker/TX2 Layout. + * 1 = Desktop GPU and Tegra Xavier+ Layout + * + * 25:23 c Lossless Framebuffer Compression type. + * + * 0 = none + * 1 = ROP/3D, layout 1, exact compression format implied by Page + * Kind field + * 2 = ROP/3D, layout 2, exact compression format implied by Page + * Kind field + * 3 = CDE horizontal + * 4 = CDE vertical + * 5 = Reserved for future use + * 6 = Reserved for future use + * 7 = Reserved for future use + * + * 55:25 - Reserved for future use. Must be zero. + */ +#define DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(c, s, g, k, h) \ + fourcc_mod_code(NVIDIA, (0x10 | \ + ((h) & 0xf) | \ + (((k) & 0xff) << 12) | \ + (((g) & 0x3) << 20) | \ + (((s) & 0x1) << 22) | \ + (((c) & 0x7) << 23))) + +/* To grandfather in prior block linear format modifiers to the above layout, + * the page kind "0", which corresponds to "pitch/linear" and hence is unusable + * with block-linear layouts, is remapped within drivers to the value 0xfe, + * which corresponds to the "generic" kind used for simple single-sample + * uncompressed color formats on Fermi - Volta GPUs. + */ +static inline __u64 +drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) +{ + if (!(modifier & 0x10) || (modifier & (0xff << 12))) + return modifier; + else + return modifier | (0xfe << 12); +} + +/* + * 16Bx2 Block Linear layout, used by Tegra K1 and later * * Pixels are arranged in 64x8 Groups Of Bytes (GOBs). GOBs are then stacked * vertically by a power of 2 (1 to 32 GOBs) to form a block. @@ -548,20 +654,20 @@ extern "C" { * in full detail. */ #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(v) \ - fourcc_mod_code(NVIDIA, 0x10 | ((v) & 0xf)) + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 0, 0, 0, (v)) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_ONE_GOB \ - fourcc_mod_code(NVIDIA, 0x10) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(0) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_TWO_GOB \ - fourcc_mod_code(NVIDIA, 0x11) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(1) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_FOUR_GOB \ - fourcc_mod_code(NVIDIA, 0x12) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(2) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_EIGHT_GOB \ - fourcc_mod_code(NVIDIA, 0x13) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(3) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_SIXTEEN_GOB \ - fourcc_mod_code(NVIDIA, 0x14) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(4) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_THIRTYTWO_GOB \ - fourcc_mod_code(NVIDIA, 0x15) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(5) /* * Some Broadcom modifiers take parameters, for example the number of -- cgit v1.2.3 From 38428d68719c454d269cb03b776d8a4b0ad66111 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:13 -0700 Subject: nexthop: support for fdb ecmp nexthops This patch introduces ecmp nexthops and nexthop groups for mac fdb entries. In subsequent patches this is used by the vxlan driver fdb entries. The use case is E-VPN multihoming [1,2,3] which requires bridged vxlan traffic to be load balanced to remote switches (vteps) belonging to the same multi-homed ethernet segment (This is analogous to a multi-homed LAG but over vxlan). Changes include new nexthop flag NHA_FDB for nexthops referenced by fdb entries. These nexthops only have ip. This patch includes appropriate checks to avoid routes referencing such nexthops. example: $ip nexthop add id 12 via 172.16.1.2 fdb $ip nexthop add id 13 via 172.16.1.3 fdb $ip nexthop add id 102 group 12/13 fdb $bridge fdb add 02:02:00:00:00:13 dev vxlan1000 nhid 101 self [1] E-VPN https://tools.ietf.org/html/rfc7432 [2] E-VPN VxLAN: https://tools.ietf.org/html/rfc8365 [3] LPC talk with mention of nexthop groups for L2 ecmp http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf v4 - fixed uninitialized variable reported by kernel test robot Reported-by: kernel test robot Signed-off-by: Roopa Prabhu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + include/net/nexthop.h | 32 +++++++++++ include/uapi/linux/nexthop.h | 3 + net/ipv4/nexthop.c | 132 +++++++++++++++++++++++++++++++++++-------- net/ipv6/route.c | 5 ++ 5 files changed, 148 insertions(+), 25 deletions(-) (limited to 'include/uapi') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index fdaf975e3331..3f615a29766e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -65,6 +65,7 @@ struct fib6_config { struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; + bool fc_is_fdb; }; struct fib6_node { diff --git a/include/net/nexthop.h b/include/net/nexthop.h index c440ccc861fc..d929c98931ad 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -26,6 +26,7 @@ struct nh_config { u8 nh_family; u8 nh_protocol; u8 nh_blackhole; + u8 nh_fdb; u32 nh_flags; int nh_ifindex; @@ -52,6 +53,7 @@ struct nh_info { u8 family; bool reject_nh; + bool fdb_nh; union { struct fib_nh_common fib_nhc; @@ -80,6 +82,7 @@ struct nexthop { struct rb_node rb_node; /* entry on netns rbtree */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ + struct list_head fdb_list; /* fdb entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */ struct net *net; @@ -88,6 +91,7 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool is_fdb_nh; refcount_t refcnt; struct rcu_head rcu; @@ -304,4 +308,32 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg); + +static inline int nexthop_get_family(struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); + + return nhi->family; +} + +static inline +struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); + + return &nhi->fib_nhc; +} + +static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh, + int hash) +{ + struct nh_info *nhi; + struct nexthop *nhp; + + nhp = nexthop_select_path(nh, hash); + if (unlikely(!nhp)) + return NULL; + nhi = rcu_dereference(nhp->nh_info); + return &nhi->fib_nhc; +} #endif diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h index 7b61867e9848..2d4a1e784cf0 100644 --- a/include/uapi/linux/nexthop.h +++ b/include/uapi/linux/nexthop.h @@ -49,6 +49,9 @@ enum { NHA_GROUPS, /* flag; only return nexthop groups in dump */ NHA_MASTER, /* u32; only return nexthops with given master dev */ + NHA_FDB, /* flag; nexthop belongs to a bridge fdb */ + /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */ + __NHA_MAX, }; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 3957364d556c..bf91edc04631 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -33,6 +33,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_FDB] = { .type = NLA_FLAG }, }; static unsigned int nh_dev_hashfn(unsigned int val) @@ -107,6 +108,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); + INIT_LIST_HEAD(&nh->fdb_list); } return nh; } @@ -227,6 +229,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; + if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; + if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); @@ -241,7 +246,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; - } else { + } else if (!nh->is_fdb_nh) { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; @@ -387,12 +392,35 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, return true; } +static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); + return -EINVAL; + } + + nhi = rtnl_dereference(nh->nh_info); + if (*nh_family == AF_UNSPEC) { + *nh_family = nhi->family; + } else if (*nh_family != nhi->family) { + NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); + return -EINVAL; + } + + return 0; +} + static int nh_check_attr_group(struct net *net, struct nlattr *tb[], struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); + u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; + u8 nhg_fdb = 0; if (len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, @@ -421,6 +449,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } } + if (tb[NHA_FDB]) + nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; @@ -432,11 +462,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } if (!valid_group_nh(nh, len, extack)) return -EINVAL; + + if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) + return -EINVAL; + + if (!nhg_fdb && nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); + return -EINVAL; + } } for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; - + if (tb[NHA_FDB]) + continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; @@ -495,6 +534,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) if (hash > atomic_read(&nhge->upper_bound)) continue; + if (nhge->nh->is_fdb_nh) + return nhge->nh; + /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ @@ -564,6 +606,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, { struct nh_info *nhi; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + return -EINVAL; + } + /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source @@ -640,6 +687,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope, { int err = 0; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + if (nh->is_group) { struct nh_group *nhg; @@ -1125,6 +1178,9 @@ static struct nexthop *nexthop_create_group(struct net *net, nh_group_rebalance(nhg); } + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + rcu_assign_pointer(nh->nh_grp, nhg); return nh; @@ -1152,7 +1208,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; - u32 tb_id = l3mdev_fib_table(cfg->dev); + u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); @@ -1161,6 +1217,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, goto out; } + if (nh->is_fdb_nh) + goto out; + /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { @@ -1186,6 +1245,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_flags = cfg->nh_flags, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, + .fc_is_fdb = cfg->nh_fdb, }; int err; @@ -1227,6 +1287,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; @@ -1248,7 +1311,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, } /* add the entry to the device based hash */ - nexthop_devhash_add(net, nhi); + if (!nh->is_fdb_nh) + nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); @@ -1352,6 +1416,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); + if (tb[NHA_FDB]) { + if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); + goto out; + } + if (nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); + goto out; + } + cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); + } + if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); @@ -1375,8 +1452,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || - tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { - NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } @@ -1385,26 +1462,28 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (!tb[NHA_OIF]) { - NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + if (!cfg->nh_fdb && !tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } - cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); - if (cfg->nh_ifindex) - cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + if (!cfg->nh_fdb && tb[NHA_OIF]) { + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); - if (!cfg->dev) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - goto out; - } else if (!(cfg->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } else if (!netif_carrier_ok(cfg->dev)) { - NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); - err = -ENETDOWN; - goto out; + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } } err = -EINVAL; @@ -1633,7 +1712,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, int *master_idx, bool *group_filter, - struct netlink_callback *cb) + bool *fdb_filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NHA_MAX + 1]; @@ -1670,6 +1749,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, case NHA_GROUPS: *group_filter = true; break; + case NHA_FDB: + *fdb_filter = true; + break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; @@ -1688,17 +1770,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { + bool group_filter = false, fdb_filter = false; struct nhmsg *nhm = nlmsg_data(cb->nlh); int dev_filter_idx = 0, master_idx = 0; struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; - bool group_filter = false; struct rb_node *node; int idx = 0, s_idx; int err; err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, - &group_filter, cb); + &group_filter, &fdb_filter, cb); if (err < 0) return err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a52ec1b86432..82cbb46a2a4f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif + if (cfg->fc_is_fdb) { + fib6_nh->fib_nh_gw6 = cfg->fc_gateway; + fib6_nh->fib_nh_gw_family = AF_INET6; + return 0; + } err = -ENODEV; if (cfg->fc_ifindex) { -- cgit v1.2.3 From 1274e1cc42264d4e629841e4f182795cb0becfd2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:14 -0700 Subject: vxlan: ecmp support for mac fdb entries Todays vxlan mac fdb entries can point to multiple remote ips (rdsts) with the sole purpose of replicating broadcast-multicast and unknown unicast packets to those remote ips. E-VPN multihoming [1,2,3] requires bridged vxlan traffic to be load balanced to remote switches (vteps) belonging to the same multi-homed ethernet segment (E-VPN multihoming is analogous to multi-homed LAG implementations, but with the inter-switch peerlink replaced with a vxlan tunnel). In other words it needs support for mac ecmp. Furthermore, for faster convergence, E-VPN multihoming needs the ability to update fdb ecmp nexthops independent of the fdb entries. New route nexthop API is perfect for this usecase. This patch extends the vxlan fdb code to take a nexthop id pointing to an ecmp nexthop group. Changes include: - New NDA_NH_ID attribute for fdbs - Use the newly added fdb nexthop groups - makes vxlan rdsts and nexthop handling code mutually exclusive - since this is a new use-case and the requirement is for ecmp nexthop groups, the fdb add and update path checks that the nexthop is really an ecmp nexthop group. This check can be relaxed in the future, if we want to introduce replication fdb nexthop groups and allow its use in lieu of current rdst lists. - fdb update requests with nexthop id's only allowed for existing fdb's that have nexthop id's - learning will not override an existing fdb entry with nexthop group - I have wrapped the switchdev offload code around the presence of rdst [1] E-VPN RFC https://tools.ietf.org/html/rfc7432 [2] E-VPN with vxlan https://tools.ietf.org/html/rfc8365 [3] http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf Includes a null check fix in vxlan_xmit from Nikolay v2 - Fixed build issue: Reported-by: kbuild test robot Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 306 +++++++++++++++++++++++++++++++++-------- include/net/vxlan.h | 25 ++++ include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 2 + 4 files changed, 275 insertions(+), 59 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a5b415fed11e..754e00240eea 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -26,6 +26,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -78,6 +79,8 @@ struct vxlan_fdb { u16 state; /* see ndm_state */ __be32 vni; u16 flags; /* see ndm_flags and below */ + struct list_head nh_list; + struct nexthop __rcu *nh; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 @@ -174,11 +177,15 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) */ static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); } static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } @@ -251,9 +258,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, { unsigned long now = jiffies; struct nda_cacheinfo ci; + bool send_ip, send_eth; struct nlmsghdr *nlh; + struct nexthop *nh; struct ndmsg *ndm; - bool send_ip, send_eth; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) @@ -264,16 +272,21 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, send_eth = send_ip = true; + nh = rcu_dereference_rtnl(fdb->nh); if (type == RTM_GETNEIGH) { - send_ip = !vxlan_addr_any(&rdst->remote_ip); + if (rdst) { + send_ip = !vxlan_addr_any(&rdst->remote_ip); + ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; + } else if (nh) { + ndm->ndm_family = nexthop_get_family(nh); + } send_eth = !is_zero_ether_addr(fdb->eth_addr); - ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; - if (rdst->offloaded) + if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; @@ -284,23 +297,30 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; + if (nh) { + if (nla_put_u32(skb, NDA_NH_ID, nh->id)) + goto nla_put_failure; + } else if (rdst) { + if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, + &rdst->remote_ip)) + goto nla_put_failure; + + if (rdst->remote_port && + rdst->remote_port != vxlan->cfg.dst_port && + nla_put_be16(skb, NDA_PORT, rdst->remote_port)) + goto nla_put_failure; + if (rdst->remote_vni != vxlan->default_dst.remote_vni && + nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) + goto nla_put_failure; + if (rdst->remote_ifindex && + nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) + goto nla_put_failure; + } - if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) - goto nla_put_failure; - - if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && - nla_put_be16(skb, NDA_PORT, rdst->remote_port)) - goto nla_put_failure; - if (rdst->remote_vni != vxlan->default_dst.remote_vni && - nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) - goto nla_put_failure; if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni))) goto nla_put_failure; - if (rdst->remote_ifindex && - nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) - goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; @@ -401,7 +421,7 @@ static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, { int err; - if (swdev_notify) { + if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, @@ -805,6 +825,8 @@ static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state, f->flags = ndm_flags; f->updated = f->used = jiffies; f->vni = src_vni; + f->nh = NULL; + INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); @@ -819,11 +841,78 @@ static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, vxlan_fdb_head(vxlan, mac, src_vni)); } +static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, + u32 nhid, struct netlink_ext_ack *extack) +{ + struct nexthop *old_nh = rtnl_dereference(fdb->nh); + struct nh_group *nhg; + struct nexthop *nh; + int err = -EINVAL; + + if (old_nh && old_nh->id == nhid) + return 0; + + nh = nexthop_find_by_id(vxlan->net, nhid); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + nh = NULL; + goto err_inval; + } + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); + goto err_inval; + } + + if (!nh->is_group || !nh->nh_grp->mpath) { + NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); + goto err_inval; + } + + /* check nexthop group family */ + nhg = rtnl_dereference(nh->nh_grp); + switch (vxlan->default_dst.remote_ip.sa.sa_family) { + case AF_INET: + if (!nhg->has_v4) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + break; + case AF_INET6: + if (nhg->has_v4) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + } + } + + if (old_nh) { + list_del_rcu(&fdb->nh_list); + nexthop_put(old_nh); + } + rcu_assign_pointer(fdb->nh, nh); + list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); + return 1; + +err_inval: + if (nh) + nexthop_put(nh); + return err; +} + static int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb **fdb) + u32 nhid, struct vxlan_fdb **fdb, + struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; @@ -838,20 +927,33 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, if (!f) return -ENOMEM; - rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); - if (rc < 0) { - kfree(f); - return rc; - } + if (nhid) + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + else + rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); + if (rc < 0) + goto errout; *fdb = f; return 0; + +errout: + kfree(f); + return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; + struct nexthop *nh; + + nh = rcu_dereference_raw(f->nh); + if (nh) { + rcu_assign_pointer(f->nh, NULL); + list_del_rcu(&f->nh_list); + nexthop_put(nh); + } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); @@ -875,10 +977,15 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); --vxlan->addrcnt; - if (do_notify) - list_for_each_entry(rd, &f->remotes, list) - vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + if (do_notify) { + if (rcu_access_pointer(f->nh)) + vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); + else + list_for_each_entry(rd, &f->remotes, list) + vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + swdev_notify, NULL); + } hlist_del_rcu(&f->hlist); call_rcu(&f->rcu, vxlan_fdb_free); @@ -897,7 +1004,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb *f, + struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -908,6 +1015,18 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, int rc = 0; int err; + if (nhid && !rcu_access_pointer(f->nh)) { + NL_SET_ERR_MSG(extack, + "Cannot replace an existing non nexthop fdb with a nexthop"); + return -EOPNOTSUPP; + } + + if (nhid && (flags & NLM_F_APPEND)) { + NL_SET_ERR_MSG(extack, + "Cannot append to a nexthop fdb"); + return -EOPNOTSUPP; + } + /* Do not allow an externally learned entry to take over an entry added * by the user. */ @@ -929,10 +1048,17 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { - rc = vxlan_fdb_replace(f, ip, port, vni, - ifindex, &oldrd); + if (nhid) { + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + if (rc < 0) + return rc; + } else { + rc = vxlan_fdb_replace(f, ip, port, vni, + ifindex, &oldrd); + } notify |= rc; } else { + NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } @@ -962,6 +1088,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, return 0; err_notify: + if (nhid) + return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { @@ -975,7 +1103,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -990,7 +1118,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, - vni, ifindex, fdb_flags, &f); + vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; @@ -1012,7 +1140,7 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -1028,14 +1156,15 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, - swdev_notify, extack); + nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, - ndm_flags, swdev_notify, extack); + ndm_flags, nhid, swdev_notify, + extack); } } @@ -1049,7 +1178,7 @@ static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, - __be32 *vni, u32 *ifindex) + __be32 *vni, u32 *ifindex, u32 *nhid) { struct net *net = dev_net(vxlan->dev); int err; @@ -1109,6 +1238,11 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, *ifindex = 0; } + if (tb[NDA_NH_ID]) + *nhid = nla_get_u32(tb[NDA_NH_ID]); + else + *nhid = 0; + return 0; } @@ -1123,7 +1257,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be16 port; __be32 src_vni, vni; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; int err; @@ -1133,10 +1267,11 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (tb[NDA_DST] == NULL) + if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1148,7 +1283,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, - true, extack); + nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; @@ -1159,8 +1294,8 @@ static int __vxlan_fdb_delete(struct vxlan_dev *vxlan, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { - struct vxlan_fdb *f; struct vxlan_rdst *rd = NULL; + struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); @@ -1195,12 +1330,13 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; - __be16 port; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; + __be16 port; int err; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1228,6 +1364,17 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; + if (rcu_access_pointer(f->nh)) { + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI, NULL); + if (err < 0) + goto out; + continue; + } + list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < cb->args[2]) goto skip; @@ -1311,6 +1458,10 @@ static bool vxlan_snoop(struct net_device *dev, if (f->state & (NUD_PERMANENT | NUD_NOARP)) return true; + /* Don't override an fdb with nexthop with a learnt entry */ + if (rcu_access_pointer(f->nh)) + return true; + if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", @@ -1333,7 +1484,7 @@ static bool vxlan_snoop(struct net_device *dev, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, - ifindex, NTF_SELF, true, NULL); + ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock[hash_index]); } @@ -2616,6 +2767,38 @@ tx_error: kfree_skb(skb); } +static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, + struct vxlan_fdb *f, __be32 vni, bool did_rsc) +{ + struct vxlan_rdst nh_rdst; + struct nexthop *nh; + bool do_xmit; + u32 hash; + + memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); + hash = skb_get_hash(skb); + + rcu_read_lock(); + nh = rcu_dereference(f->nh); + if (!nh) { + rcu_read_unlock(); + goto drop; + } + do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); + rcu_read_unlock(); + + if (likely(do_xmit)) + vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); + else + goto drop; + + return; + +drop: + dev->stats.tx_dropped++; + dev_kfree_skb(skb); +} + /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. @@ -2692,22 +2875,27 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } } - list_for_each_entry_rcu(rdst, &f->remotes, list) { - struct sk_buff *skb1; + if (rcu_access_pointer(f->nh)) { + vxlan_xmit_nh(skb, dev, f, + (vni ? : vxlan->default_dst.remote_vni), did_rsc); + } else { + list_for_each_entry_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; - if (!fdst) { - fdst = rdst; - continue; + if (!fdst) { + fdst = rdst; + continue; + } + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); + if (fdst) + vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); + else + kfree_skb(skb); } - if (fdst) - vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); - else - kfree_skb(skb); return NETDEV_TX_OK; } @@ -3615,7 +3803,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, - NTF_SELF, &f); + NTF_SELF, 0, &f, extack); if (err) return err; } @@ -4013,7 +4201,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, - NTF_SELF, true, extack); + NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); netdev_adjacent_change_abort(dst->remote_dev, @@ -4335,7 +4523,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, - false, extack); + 0, false, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 373aadcfea21..3a41627cbdfe 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define IANA_VXLAN_UDP_PORT 4789 @@ -487,4 +488,28 @@ static inline void vxlan_flag_attr_error(int attrtype, #undef VXLAN_FLAG } +static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, + int hash, + struct vxlan_rdst *rdst) +{ + struct fib_nh_common *nhc; + + nhc = nexthop_path_fdb_result(nh, hash); + if (unlikely(!nhc)) + return false; + + switch (nhc->nhc_gw_family) { + case AF_INET: + rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4; + rdst->remote_ip.sa.sa_family = AF_INET; + break; + case AF_INET6: + rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6; + rdst->remote_ip.sa.sa_family = AF_INET6; + break; + } + + return true; +} + #endif diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index cd144e3099a3..eefcda8ca44e 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -29,6 +29,7 @@ enum { NDA_LINK_NETNSID, NDA_SRC_VNI, NDA_PROTOCOL, /* Originator of entry */ + NDA_NH_ID, __NDA_MAX }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b607ea602774..37e4dba62460 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family) } const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, @@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From 31344b2fcead3239c5b801016d9bae82506b92c2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 May 2020 14:49:10 +0200 Subject: btrfs: remove more obsolete v0 extent ref declarations The extent references v0 have been superseded long time go, there are some unused declarations of access helpers. We can safely remove them now. The struct btrfs_extent_ref_v0 is not used anywhere, but struct btrfs_extent_item_v0 is still part of a backward compatibility check in relocation.c and thus not removed. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 --------- include/uapi/linux/btrfs_tree.h | 9 --------- 2 files changed, 18 deletions(-) (limited to 'include/uapi') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b78ab0213bb..86ec25250ac5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1648,9 +1648,6 @@ BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); -BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); - - BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); static inline void btrfs_tree_block_key(struct extent_buffer *eb, @@ -1698,12 +1695,6 @@ static inline u32 btrfs_extent_inline_ref_size(int type) return 0; } -BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); -BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, - generation, 64); -BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); -BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); - /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 8e322e2c7e78..a3f3975df0de 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -519,15 +519,6 @@ struct btrfs_extent_inline_ref { __le64 offset; } __attribute__ ((__packed__)); -/* old style backrefs item */ -struct btrfs_extent_ref_v0 { - __le64 root; - __le64 generation; - __le64 objectid; - __le32 count; -} __attribute__ ((__packed__)); - - /* dev extents record free space on individual devices. The owner * field points back to the chunk allocation mapping tree that allocated * the extent. The chunk tree uuid field is a way to double check the owner -- cgit v1.2.3 From 61aec25a6db5d0c2e8ab5da6d2d152269d0d9d69 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 26 May 2020 14:29:04 +0200 Subject: cls_flower: Support filtering on multiple MPLS Label Stack Entries With struct flow_dissector_key_mpls now recording the first FLOW_DIS_MPLS_MAX labels, we can extend Flower to filter on any of these LSEs independently. In order to avoid creating new netlink attributes for every possible depth, let's define a new TCA_FLOWER_KEY_MPLS_OPTS nested attribute that contains the list of LSEs to match. Each LSE is represented by another attribute, TCA_FLOWER_KEY_MPLS_OPTS_LSE, which then contains the attributes representing the depth and the MPLS fields to match at this depth (label, TTL, etc.). For each MPLS field, the mask is always set to all-ones, as this is what the original API did. We could allow user configurable masks in the future if there is demand for more flexibility. The new API also allows to only specify an LSE depth. In that case, Flower only verifies that the MPLS label stack depth is greater or equal to the provided depth (that is, an LSE exists at this depth). Filters that only match on one (or more) fields of the first LSE are dumped using the old netlink attributes, to avoid confusing user space programs that don't understand the new API. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 23 ++++ net/sched/cls_flower.c | 243 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 265 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index fc672b232437..7576209d96f9 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -576,6 +576,8 @@ enum { TCA_FLOWER_KEY_CT_LABELS, /* u128 */ TCA_FLOWER_KEY_CT_LABELS_MASK, /* u128 */ + TCA_FLOWER_KEY_MPLS_OPTS, + __TCA_FLOWER_MAX, }; @@ -640,6 +642,27 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) +enum { + TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC, + TCA_FLOWER_KEY_MPLS_OPTS_LSE, + __TCA_FLOWER_KEY_MPLS_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_MPLS_OPTS_MAX (__TCA_FLOWER_KEY_MPLS_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_MPLS_OPT_LSE_UNSPEC, + TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + __TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, +}; + +#define TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX \ + (__TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f524afe0b7f5..96f5999281e0 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -668,6 +668,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_MPLS_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, @@ -726,6 +727,20 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; +static const struct nla_policy +mpls_opts_policy[TCA_FLOWER_KEY_MPLS_OPTS_MAX + 1] = { + [TCA_FLOWER_KEY_MPLS_OPTS_LSE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { + [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TC] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -776,6 +791,126 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_set_key_mpls_lse(const struct nlattr *nla_lse, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1]; + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + u8 lse_index; + u8 depth; + int err; + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, nla_lse, + mpls_stack_entry_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]) { + NL_SET_ERR_MSG(extack, "Missing MPLS option \"depth\""); + return -EINVAL; + } + + depth = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]); + + /* LSE depth starts at 1, for consistency with terminology used by + * RFC 3031 (section 3.9), where depth 0 refers to unlabeled packets. + */ + if (depth < 1 || depth > FLOW_DIS_MPLS_MAX) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH], + "Invalid MPLS depth"); + return -EINVAL; + } + lse_index = depth - 1; + + dissector_set_mpls_lse(key_val, lse_index); + dissector_set_mpls_lse(key_mask, lse_index); + + lse_val = &key_val->ls[lse_index]; + lse_mask = &key_mask->ls[lse_index]; + + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]) { + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]) { + u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]); + + if (bos & ~MPLS_BOS_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS], + "Bottom Of Stack (BOS) must be 0 or 1"); + return -EINVAL; + } + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]) { + u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]); + + if (tc & ~MPLS_TC_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC], + "Traffic Class (TC) must be between 0 and 7"); + return -EINVAL; + } + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]) { + u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]); + + if (label & ~MPLS_LABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL], + "Label must be between 0 and 1048575"); + return -EINVAL; + } + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + } + + return 0; +} + +static int fl_set_key_mpls_opts(const struct nlattr *nla_mpls_opts, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *nla_lse; + int rem; + int err; + + if (!(nla_mpls_opts->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, nla_mpls_opts, + "NLA_F_NESTED is missing"); + return -EINVAL; + } + + nla_for_each_nested(nla_lse, nla_mpls_opts, rem) { + if (nla_type(nla_lse) != TCA_FLOWER_KEY_MPLS_OPTS_LSE) { + NL_SET_ERR_MSG_ATTR(extack, nla_lse, + "Invalid MPLS option type"); + return -EINVAL; + } + + err = fl_set_key_mpls_lse(nla_lse, key_val, key_mask, extack); + if (err < 0) + return err; + } + if (rem) { + NL_SET_ERR_MSG(extack, + "Bytes leftover after parsing MPLS options"); + return -EINVAL; + } + + return 0; +} + static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask, @@ -784,6 +919,21 @@ static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_mpls_lse *lse_mask; struct flow_dissector_mpls_lse *lse_val; + if (tb[TCA_FLOWER_KEY_MPLS_OPTS]) { + if (tb[TCA_FLOWER_KEY_MPLS_TTL] || + tb[TCA_FLOWER_KEY_MPLS_BOS] || + tb[TCA_FLOWER_KEY_MPLS_TC] || + tb[TCA_FLOWER_KEY_MPLS_LABEL]) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPTS], + "MPLS label, Traffic Class, Bottom Of Stack and Time To Live must be encapsulated in the MPLS options attribute"); + return -EBADMSG; + } + + return fl_set_key_mpls_opts(tb[TCA_FLOWER_KEY_MPLS_OPTS], + key_val, key_mask, extack); + } + lse_val = &key_val->ls[0]; lse_mask = &key_mask->ls[0]; @@ -2232,6 +2382,89 @@ static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, return 0; } +static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask, + u8 lse_index) +{ + struct flow_dissector_mpls_lse *lse_mask = &mpls_mask->ls[lse_index]; + struct flow_dissector_mpls_lse *lse_key = &mpls_key->ls[lse_index]; + int err; + + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + lse_index + 1); + if (err) + return err; + + if (lse_mask->mpls_ttl) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + lse_key->mpls_ttl); + if (err) + return err; + } + if (lse_mask->mpls_bos) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + lse_key->mpls_bos); + if (err) + return err; + } + if (lse_mask->mpls_tc) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + lse_key->mpls_tc); + if (err) + return err; + } + if (lse_mask->mpls_label) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + lse_key->mpls_label); + if (err) + return err; + } + + return 0; +} + +static int fl_dump_key_mpls_opts(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask) +{ + struct nlattr *opts; + struct nlattr *lse; + u8 lse_index; + int err; + + opts = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS); + if (!opts) + return -EMSGSIZE; + + for (lse_index = 0; lse_index < FLOW_DIS_MPLS_MAX; lse_index++) { + if (!(mpls_mask->used_lses & 1 << lse_index)) + continue; + + lse = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS_LSE); + if (!lse) { + err = -EMSGSIZE; + goto err_opts; + } + + err = fl_dump_key_mpls_opt_lse(skb, mpls_key, mpls_mask, + lse_index); + if (err) + goto err_opts_lse; + nla_nest_end(skb, lse); + } + nla_nest_end(skb, opts); + + return 0; + +err_opts_lse: + nla_nest_cancel(skb, lse); +err_opts: + nla_nest_cancel(skb, opts); + + return err; +} + static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) @@ -2240,12 +2473,20 @@ static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_mpls_lse *lse_key; int err; - if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) + if (!mpls_mask->used_lses) return 0; lse_mask = &mpls_mask->ls[0]; lse_key = &mpls_key->ls[0]; + /* For backward compatibility, don't use the MPLS nested attributes if + * the rule can be expressed using the old attributes. + */ + if (mpls_mask->used_lses & ~1 || + (!lse_mask->mpls_ttl && !lse_mask->mpls_bos && + !lse_mask->mpls_tc && !lse_mask->mpls_label)) + return fl_dump_key_mpls_opts(skb, mpls_key, mpls_mask); + if (lse_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, lse_key->mpls_ttl); -- cgit v1.2.3 From a331172b156b23e83dfb556ade0ca23426c3f149 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:37 +0200 Subject: net: ethtool: Add attributes for cable test TDR data Some Ethernet PHYs can return the raw time domain reflectromatry data. Add the attributes to allow this data to be requested and returned via netlink ethtool. Signed-off-by: Andrew Lunn v2: m -> cm Report what the PHY actually used for start/stop/step. Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 81 ++++++++++++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 63 ++++++++++++++++++++++ 2 files changed, 144 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 7e651ea33eab..dae36227d590 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -205,6 +205,7 @@ Userspace to kernel: ``ETHTOOL_MSG_EEE_SET`` set EEE settings ``ETHTOOL_MSG_TSINFO_GET`` get timestamping info ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test + ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` action start raw TDR cable test ===================================== ================================ Kernel to userspace: @@ -237,6 +238,7 @@ Kernel to userspace: ``ETHTOOL_MSG_EEE_NTF`` EEE settings ``ETHTOOL_MSG_TSINFO_GET_REPLY`` timestamping info ``ETHTOOL_MSG_CABLE_TEST_NTF`` Cable test results + ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF`` Cable test TDR results ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1014,6 +1016,84 @@ information. | | | ``ETHTOOL_A_CABLE_FAULT_LENGTH_CM`` | u32 | length in cm | +-+-+-----------------------------------------+--------+---------------------+ +CABLE_TEST TDR +============== + +Start a cable test and report raw TDR data + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` nested request header + ==================================== ====== ========================== + +Notification contents: + +Raw TDR data is gathered by sending a pulse down the cable and +recording the amplitude of the reflected pulse for a given distance. + +It can take a number of seconds to collect TDR data, especial if the +full 100 meters is probed at 1 meter intervals. When the test is +started a notification will be sent containing just +ETHTOOL_A_CABLE_TEST_TDR_STATUS with the value +ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED. + +When the test has completed a second notification will be sent +containing ETHTOOL_A_CABLE_TEST_TDR_STATUS with the value +ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED and the TDR data. + +The message may optionally contain the amplitude of the pulse send +down the cable. This is measured in mV. A reflection should not be +bigger than transmitted pulse. + +Before the raw TDR data should be an ETHTOOL_A_CABLE_TDR_NEST_STEP +nest containing information about the distance along the cable for the +first reading, the last reading, and the step between each +reading. Distances are measured in centimeters. These should be the +exact values the PHY used. These may be different to what the user +requested, if the native measurement resolution is greater than 1 cm. + +For each step along the cable, a ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE is +used to report the amplitude of the reflection for a given pair. + + +---------------------------------------------+--------+----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` | nested | reply header | + +---------------------------------------------+--------+----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_STATUS`` | u8 | completed | + +---------------------------------------------+--------+----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST`` | nested | all the results | + +-+-------------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_TDR_NEST_PULSE`` | nested | TX Pulse amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_PULSE_mV`` | s16 | Pulse amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_NEST_STEP`` | nested | TDR step info | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE ``| u32 | First data distance | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE `` | u32 | Last data distance | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE `` | u32 | distance of each step| + +-+-+-----------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE`` | nested | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE`` | nested | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | ``ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE`` | nested | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_RESULTS_PAIR`` | u8 | pair number | + +-+-+-----------------------------------------+--------+----------------------+ + | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | + +-+-+-----------------------------------------+--------+----------------------+ + Request translation =================== @@ -1110,4 +1190,5 @@ are netlink only. ``ETHTOOL_GFECPARAM`` n/a ``ETHTOOL_SFECPARAM`` n/a n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' + n/a ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT'' =================================== ===================================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index e6f109b76c9a..739faa7070c6 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -40,6 +40,7 @@ enum { ETHTOOL_MSG_EEE_SET, ETHTOOL_MSG_TSINFO_GET, ETHTOOL_MSG_CABLE_TEST_ACT, + ETHTOOL_MSG_CABLE_TEST_TDR_ACT, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -76,6 +77,7 @@ enum { ETHTOOL_MSG_EEE_NTF, ETHTOOL_MSG_TSINFO_GET_REPLY, ETHTOOL_MSG_CABLE_TEST_NTF, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -478,6 +480,67 @@ enum { ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) }; +/* CABLE TEST TDR */ + +enum { + ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CNT, + ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 +}; + +/* CABLE TEST TDR NOTIFY */ + +enum { + ETHTOOL_A_CABLE_AMPLITUDE_UNSPEC, + ETHTOOL_A_CABLE_AMPLITUDE_PAIR, /* u8 */ + ETHTOOL_A_CABLE_AMPLITUDE_mV, /* s16 */ + + __ETHTOOL_A_CABLE_AMPLITUDE_CNT, + ETHTOOL_A_CABLE_AMPLITUDE_MAX = (__ETHTOOL_A_CABLE_AMPLITUDE_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_PULSE_UNSPEC, + ETHTOOL_A_CABLE_PULSE_mV, /* s16 */ + + __ETHTOOL_A_CABLE_PULSE_CNT, + ETHTOOL_A_CABLE_PULSE_MAX = (__ETHTOOL_A_CABLE_PULSE_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_STEP_UNSPEC, + ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE, /* u32 */ + ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, /* u32 */ + ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, /* u32 */ + + __ETHTOOL_A_CABLE_STEP_CNT, + ETHTOOL_A_CABLE_STEP_MAX = (__ETHTOOL_A_CABLE_STEP_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TDR_NEST_UNSPEC, + ETHTOOL_A_CABLE_TDR_NEST_STEP, /* nest - ETHTTOOL_A_CABLE_STEP */ + ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE, /* nest - ETHTOOL_A_CABLE_AMPLITUDE */ + ETHTOOL_A_CABLE_TDR_NEST_PULSE, /* nest - ETHTOOL_A_CABLE_PULSE */ + + __ETHTOOL_A_CABLE_TDR_NEST_CNT, + ETHTOOL_A_CABLE_TDR_NEST_MAX = (__ETHTOOL_A_CABLE_TDR_NEST_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, + ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From f2bc8ad31a7f814237bc6301d59296d76505a688 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 27 May 2020 00:21:41 +0200 Subject: net: ethtool: Allow PHY cable test TDR data to configured Allow the user to configure where on the cable the TDR data should be retrieved, in terms of first and last sample, and the step between samples. Also add the ability to ask for TDR data for just one pair. If this configuration is not provided, it defaults to 1-150m at 1m intervals for all pairs. Signed-off-by: Andrew Lunn v3: Move the TDR configuration into a structure Add a range check on step Use NL_SET_ERR_MSG_ATTR() when appropriate Move TDR configuration into a nest Document attributes in the request Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 22 +++++- drivers/net/phy/marvell.c | 59 ++++++++++----- drivers/net/phy/phy.c | 5 +- include/linux/phy.h | 21 +++++- include/uapi/linux/ethtool_netlink.h | 13 ++++ net/ethtool/cabletest.c | 104 ++++++++++++++++++++++++++- 6 files changed, 197 insertions(+), 27 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index dae36227d590..d42661b91128 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1023,9 +1023,25 @@ Start a cable test and report raw TDR data Request contents: - ==================================== ====== ========================== - ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` nested request header - ==================================== ====== ========================== + +--------------------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_HEADER`` | nested | reply header | + +--------------------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_CABLE_TEST_TDR_CFG`` | nested | test configuration | + +-+------------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE `` | u32 | first data distance | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE `` | u32 | last data distance | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE `` | u32 | distance of each step | + +-+-+----------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR`` | u8 | pair to test | + +-+-+----------------------------------------+--------+-----------------------+ + +The ETHTOOL_A_CABLE_TEST_TDR_CFG is optional, as well as all members +of the nest. All distances are expressed in centimeters. The PHY takes +the distances as a guide, and rounds to the nearest distance it +actually supports. If a pair is passed, only that one pair will be +tested. Otherwise all pairs are tested. Notification contents: diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e597bee2e966..335e51d6f138 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -198,6 +198,7 @@ #define MII_VCT5_CTRL_PEEK_HYST_DEFAULT 3 #define MII_VCT5_SAMPLE_POINT_DISTANCE 0x18 +#define MII_VCT5_SAMPLE_POINT_DISTANCE_MAX 511 #define MII_VCT5_TX_PULSE_CTRL 0x1c #define MII_VCT5_TX_PULSE_CTRL_DONT_WAIT_LINK_DOWN BIT(12) #define MII_VCT5_TX_PULSE_CTRL_PULSE_WIDTH_128nS (0x0 << 10) @@ -270,6 +271,10 @@ struct marvell_priv { char *hwmon_name; struct device *hwmon_dev; bool cable_test_tdr; + u32 first; + u32 last; + u32 step; + s8 pair; }; static int marvell_read_page(struct phy_device *phydev) @@ -1787,12 +1792,18 @@ static u32 marvell_vct5_distance2cm(int distance) return distance * 805 / 10; } +static u32 marvell_vct5_cm2distance(int cm) +{ + return cm * 10 / 805; +} + static int marvell_vct5_amplitude_distance(struct phy_device *phydev, - int distance) + int distance, int pair) { - int mV_pair0, mV_pair1, mV_pair2, mV_pair3; u16 reg; int err; + int mV; + int i; err = phy_write_paged(phydev, MII_MARVELL_VCT5_PAGE, MII_VCT5_SAMPLE_POINT_DISTANCE, @@ -1814,21 +1825,20 @@ static int marvell_vct5_amplitude_distance(struct phy_device *phydev, if (err) return err; - mV_pair0 = marvell_vct5_amplitude(phydev, 0); - mV_pair1 = marvell_vct5_amplitude(phydev, 1); - mV_pair2 = marvell_vct5_amplitude(phydev, 2); - mV_pair3 = marvell_vct5_amplitude(phydev, 3); + for (i = 0; i < 4; i++) { + if (pair != PHY_PAIR_ALL && i != pair) + continue; - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_A, mV_pair0); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_B, mV_pair1); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_C, mV_pair2); - ethnl_cable_test_amplitude(phydev, ETHTOOL_A_CABLE_PAIR_D, mV_pair3); + mV = marvell_vct5_amplitude(phydev, i); + ethnl_cable_test_amplitude(phydev, i, mV); + } return 0; } static int marvell_vct5_amplitude_graph(struct phy_device *phydev) { + struct marvell_priv *priv = phydev->priv; int distance; int err; u16 reg; @@ -1843,8 +1853,11 @@ static int marvell_vct5_amplitude_graph(struct phy_device *phydev) if (err) return err; - for (distance = 0; distance <= 100; distance++) { - err = marvell_vct5_amplitude_distance(phydev, distance); + for (distance = priv->first; + distance <= priv->last; + distance += priv->step) { + err = marvell_vct5_amplitude_distance(phydev, distance, + priv->pair); if (err) return err; } @@ -1918,11 +1931,24 @@ static int marvell_vct7_cable_test_start(struct phy_device *phydev) MII_VCT7_CTRL_CENTIMETERS); } -static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) +static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev, + const struct phy_tdr_config *cfg) { struct marvell_priv *priv = phydev->priv; int ret; + priv->cable_test_tdr = true; + priv->first = marvell_vct5_cm2distance(cfg->first); + priv->last = marvell_vct5_cm2distance(cfg->last); + priv->step = marvell_vct5_cm2distance(cfg->step); + priv->pair = cfg->pair; + + if (priv->first > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + + if (priv->last > MII_VCT5_SAMPLE_POINT_DISTANCE_MAX) + return -EINVAL; + /* Disable VCT7 */ ret = phy_write_paged(phydev, MII_MARVELL_VCT7_PAGE, MII_VCT7_CTRL, 0); @@ -1933,15 +1959,14 @@ static int marvell_vct5_cable_test_tdr_start(struct phy_device *phydev) if (ret) return ret; - priv->cable_test_tdr = true; ret = ethnl_cable_test_pulse(phydev, 1000); if (ret) return ret; return ethnl_cable_test_step(phydev, - marvell_vct5_distance2cm(0), - marvell_vct5_distance2cm(100), - marvell_vct5_distance2cm(1)); + marvell_vct5_distance2cm(priv->first), + marvell_vct5_distance2cm(priv->last), + marvell_vct5_distance2cm(priv->step)); } static int marvell_vct7_distance_to_length(int distance, bool meter) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 495d9ba3d5bf..1de3938628f4 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -553,7 +553,8 @@ out: EXPORT_SYMBOL(phy_start_cable_test); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { struct net_device *dev = phydev->attached_dev; int err = -ENOMEM; @@ -590,7 +591,7 @@ int phy_start_cable_test_tdr(struct phy_device *phydev, phy_link_down(phydev); netif_testing_on(dev); - err = phydev->drv->cable_test_tdr_start(phydev); + err = phydev->drv->cable_test_tdr_start(phydev, config); if (err) { netif_testing_off(dev); phy_link_up(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index d3c384f353ca..8c05d0fb5c00 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -548,6 +548,18 @@ struct phy_device { #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) +/* A structure containing possible configuration parameters + * for a TDR cable test. The driver does not need to implement + * all the parameters, but should report what is actually used. + */ +struct phy_tdr_config { + u32 first; + u32 last; + u32 step; + s8 pair; +}; +#define PHY_PAIR_ALL -1 + /* struct phy_driver: Driver structure for a particular PHY type * * driver_data: static driver data @@ -701,7 +713,8 @@ struct phy_driver { int (*cable_test_start)(struct phy_device *dev); /* Start a raw TDR cable test */ - int (*cable_test_tdr_start)(struct phy_device *dev); + int (*cable_test_tdr_start)(struct phy_device *dev, + const struct phy_tdr_config *config); /* Once per second, or on interrupt, request the status of the * test. @@ -1256,7 +1269,8 @@ int phy_reset_after_clk_enable(struct phy_device *phydev); int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, @@ -1267,7 +1281,8 @@ int phy_start_cable_test(struct phy_device *phydev, } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 739faa7070c6..fc9051f2eeac 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -482,9 +482,22 @@ enum { /* CABLE TEST TDR */ +enum { + ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 +}; + enum { ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_CNT, diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index 390d0673ff01..9991688d7d1d 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -5,7 +5,11 @@ #include "netlink.h" #include "common.h" -/* CABLE_TEST_ACT */ +/* 802.3 standard allows 100 meters for BaseT cables. However longer + * cables might work, depending on the quality of the cables and the + * PHY. So allow testing for up to 150 meters. + */ +#define MAX_CABLE_LENGTH_CM (150 * 100) static const struct nla_policy cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { @@ -203,16 +207,107 @@ err: } EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); +struct cable_test_tdr_req_info { + struct ethnl_req_info base; +}; + +static const struct nla_policy +cable_test_tdr_act_cfg_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 }, +}; + static const struct nla_policy cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED }, }; +/* CABLE_TEST_TDR_ACT */ +int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX, nest, + cable_test_tdr_act_cfg_policy, info->extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]) + cfg->first = nla_get_u32( + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]); + else + cfg->first = 100; + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]) + cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]); + else + cfg->last = MAX_CABLE_LENGTH_CM; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]) + cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]); + else + cfg->step = 100; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) { + cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]); + if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) { + NL_SET_ERR_MSG_ATTR( + info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR], + "invalid pair parameter"); + return -EINVAL; + } + } else { + cfg->pair = PHY_PAIR_ALL; + } + + if (cfg->first > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST], + "invalid first parameter"); + return -EINVAL; + } + + if (cfg->last > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST], + "invalid last parameter"); + return -EINVAL; + } + + if (cfg->first > cfg->last) { + NL_SET_ERR_MSG(info->extack, "invalid first/last parameter"); + return -EINVAL; + } + + if (!cfg->step) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "invalid step parameter"); + return -EINVAL; + } + + if (cfg->step > (cfg->last - cfg->first)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "step parameter too big"); + return -EINVAL; + } + + return 0; +} + int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; struct ethnl_req_info req_info = {}; + struct phy_tdr_config cfg; struct net_device *dev; int ret; @@ -235,12 +330,17 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; } + ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG], + info, &cfg); + if (ret) + goto out_dev_put; + rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = phy_start_cable_test_tdr(dev->phydev, info->extack); + ret = phy_start_cable_test_tdr(dev->phydev, info->extack, &cfg); ethnl_ops_complete(dev); -- cgit v1.2.3 From 33462e68231bccfe563a87614f4c4dd5d333837c Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 24 Apr 2020 14:29:03 +0300 Subject: cfg80211: add support for TID specific AMSDU configuration This patch adds support to control per TID MSDU aggregation using the NL80211_TID_CONFIG_ATTR_AMSDU_CTRL attribute. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-4-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 10 +++++++--- net/wireless/nl80211.c | 8 ++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e71d4f690ef1..5cacf24cc9f0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -640,8 +640,9 @@ struct cfg80211_chan_def { * @noack: noack configuration value for the TID * @retry_long: retry count value * @retry_short: retry count value - * @ampdu: Enable/Disable aggregation + * @ampdu: Enable/Disable MPDU aggregation * @rtscts: Enable/Disable RTS/CTS + * @amsdu: Enable/Disable MSDU aggregation */ struct cfg80211_tid_cfg { bool config_override; @@ -651,6 +652,7 @@ struct cfg80211_tid_cfg { u8 retry_long, retry_short; enum nl80211_tid_config ampdu; enum nl80211_tid_config rtscts; + enum nl80211_tid_config amsdu; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9679d561f7d0..1ccb0bf657ec 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4844,12 +4844,15 @@ enum nl80211_tid_config { * &NL80211_CMD_SET_TID_CONFIG. Its type is u8, min value is 1 and * the max value is advertised by the driver in this attribute on * output in wiphy capabilities. - * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable aggregation for the TIDs - * specified in %NL80211_TID_CONFIG_ATTR_TIDS. Its type is u8, using - * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable MPDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. * @NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL: Enable/Disable RTS_CTS for the TIDs * specified in %NL80211_TID_CONFIG_ATTR_TIDS. It is u8 type, using * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. + * Its type is u8, using the values from &nl80211_tid_config. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4863,6 +4866,7 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_RETRY_LONG, NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, + NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fa66d5b6f557..482a80b78844 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -343,6 +343,8 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = + NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -14080,6 +14082,12 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMSDU_CTRL); + tid_conf->amsdu = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From c03369558c435f7e82f7c06b0173fa73c1ed15c0 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 24 Apr 2020 14:29:04 +0300 Subject: nl80211: simplify peer specific TID configuration Current rule for applying TID configuration for specific peer looks overly complicated. No need to reject new TID configuration when override flag is specified. Another call with the same TID configuration, but without override flag, allows to apply new configuration anyway. Use the same approach as for the 'all peers' case: if override flag is specified, then reset existing TID configuration and immediately apply a new one. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200424112905.26770-5-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 10 ++++------ net/wireless/nl80211.c | 5 +---- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1ccb0bf657ec..d1b1d9e49887 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4823,12 +4823,10 @@ enum nl80211_tid_config { * (%NL80211_TID_CONFIG_ATTR_TIDS, %NL80211_TID_CONFIG_ATTR_OVERRIDE). * @NL80211_TID_CONFIG_ATTR_PEER_SUPP: same as the previous per-vif one, but * per peer instead. - * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if no peer - * is selected, if set indicates that the new configuration overrides - * all previous peer configurations, otherwise previous peer specific - * configurations should be left untouched. If peer is selected then - * it will reset particular TID configuration of that peer and it will - * not accept other TID config attributes along with peer. + * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if set indicates + * that the new configuration overrides all previous peer + * configurations, otherwise previous peer specific configurations + * should be left untouched. * @NL80211_TID_CONFIG_ATTR_TIDS: a bitmask value of TIDs (bit 0 to 7) * Its type is u16. * @NL80211_TID_CONFIG_ATTR_NOACK: Configure ack policy for the TID. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 482a80b78844..258c621f651c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14036,10 +14036,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (rdev->ops->reset_tid_config) { err = rdev_reset_tid_config(rdev, dev, peer, tid_conf->tids); - /* If peer is there no other configuration will be - * allowed - */ - if (err || peer) + if (err) return err; } else { return -EINVAL; -- cgit v1.2.3 From 942ba88ba9c87f5e225574f1f0d6548f0105ed73 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:51 -0700 Subject: nl80211: add KHz frequency offset for most wifi commands cfg80211 recently gained the ability to understand a frequency offset component in KHz. Expose this in nl80211 through the new attributes NL80211_ATTR_WIPHY_FREQ_OFFSET, NL80211_FREQUENCY_ATTR_OFFSET, NL80211_ATTR_CENTER_FREQ1_OFFSET, and NL80211_BSS_FREQUENCY_OFFSET. These add support to send and receive a KHz offset component with the following NL80211 commands: - NL80211_CMD_FRAME - NL80211_CMD_GET_SCAN - NL80211_CMD_AUTHENTICATE - NL80211_CMD_ASSOCIATE - NL80211_CMD_CONNECT Along with any other command which takes a chandef, ie: - NL80211_CMD_SET_CHANNEL - NL80211_CMD_SET_WIPHY - NL80211_CMD_START_AP - NL80211_CMD_RADAR_DETECT - NL80211_CMD_NOTIFY_RADAR - NL80211_CMD_CHANNEL_SWITCH - NL80211_JOIN_IBSS - NL80211_CMD_REMAIN_ON_CHANNEL - NL80211_CMD_JOIN_OCB - NL80211_CMD_JOIN_MESH - NL80211_CMD_TDLS_CHANNEL_SWITCH If the driver advertises a band containing channels with frequency offset, it must also verify support for frequency offset channels in its cfg80211 ops, or return an error. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 50 ++++++++++++++++++---------- net/wireless/nl80211.c | 78 ++++++++++++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 37 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1b1d9e49887..b1cd132c1d27 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -296,13 +296,14 @@ * to get a list of all present wiphys. * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or * %NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME, - * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ (and the - * attributes determining the channel width; this is used for setting - * monitor mode channel), %NL80211_ATTR_WIPHY_RETRY_SHORT, - * %NL80211_ATTR_WIPHY_RETRY_LONG, %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, - * and/or %NL80211_ATTR_WIPHY_RTS_THRESHOLD. - * However, for setting the channel, see %NL80211_CMD_SET_CHANNEL - * instead, the support here is for backward compatibility only. + * %NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET (and the attributes determining the + * channel width; this is used for setting monitor mode channel), + * %NL80211_ATTR_WIPHY_RETRY_SHORT, %NL80211_ATTR_WIPHY_RETRY_LONG, + * %NL80211_ATTR_WIPHY_FRAG_THRESHOLD, and/or + * %NL80211_ATTR_WIPHY_RTS_THRESHOLD. However, for setting the channel, + * see %NL80211_CMD_SET_CHANNEL instead, the support here is for backward + * compatibility only. * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request * or rename notification. Has attributes %NL80211_ATTR_WIPHY and * %NL80211_ATTR_WIPHY_NAME. @@ -351,7 +352,8 @@ * %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_INACTIVITY_TIMEOUT, * %NL80211_ATTR_ACL_POLICY and %NL80211_ATTR_MAC_ADDRS. * The channel to use can be set on the interface or be given using the - * %NL80211_ATTR_WIPHY_FREQ and the attributes determining channel width. + * %NL80211_ATTR_WIPHY_FREQ and %NL80211_ATTR_WIPHY_FREQ_OFFSET, and the + * attributes determining channel width. * @NL80211_CMD_NEW_BEACON: old alias for %NL80211_CMD_START_AP * @NL80211_CMD_STOP_AP: Stop AP operation on the given interface * @NL80211_CMD_DEL_BEACON: old alias for %NL80211_CMD_STOP_AP @@ -536,11 +538,12 @@ * interface. %NL80211_ATTR_MAC is used to specify PeerSTAAddress (and * BSSID in case of station mode). %NL80211_ATTR_SSID is used to specify * the SSID (mainly for association, but is included in authentication - * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ is used - * to specify the frequence of the channel in MHz. %NL80211_ATTR_AUTH_TYPE - * is used to specify the authentication type. %NL80211_ATTR_IE is used to - * define IEs (VendorSpecificInfo, but also including RSN IE and FT IEs) - * to be added to the frame. + * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ + + * %NL80211_ATTR_WIPHY_FREQ_OFFSET is used to specify the frequence of the + * channel in MHz. %NL80211_ATTR_AUTH_TYPE is used to specify the + * authentication type. %NL80211_ATTR_IE is used to define IEs + * (VendorSpecificInfo, but also including RSN IE and FT IEs) to be added + * to the frame. * When used as an event, this reports reception of an Authentication * frame in station and IBSS modes when the local MLME processed the * frame, i.e., it was for the local STA and was received in correct @@ -595,8 +598,9 @@ * requests to connect to a specified network but without separating * auth and assoc steps. For this, you need to specify the SSID in a * %NL80211_ATTR_SSID attribute, and can optionally specify the association - * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_USE_MFP, - * %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, %NL80211_ATTR_CONTROL_PORT, + * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, + * %NL80211_ATTR_USE_MFP, %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, + * %NL80211_ATTR_WIPHY_FREQ_OFFSET, %NL80211_ATTR_CONTROL_PORT, * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, * %NL80211_ATTR_CONTROL_PORT_OVER_NL80211, %NL80211_ATTR_MAC_HINT, and @@ -1433,7 +1437,8 @@ enum nl80211_commands { * of &enum nl80211_chan_width, describing the channel width. See the * documentation of the enum for more information. * @NL80211_ATTR_CENTER_FREQ1: Center frequency of the first part of the - * channel, used for anything but 20 MHz bandwidth + * channel, used for anything but 20 MHz bandwidth. In S1G this is the + * operating channel center frequency. * @NL80211_ATTR_CENTER_FREQ2: Center frequency of the second part of the * channel, used only for 80+80 MHz bandwidth * @NL80211_ATTR_WIPHY_CHANNEL_TYPE: included with NL80211_ATTR_WIPHY_FREQ @@ -2480,9 +2485,14 @@ enum nl80211_commands { * entry without having to force a disconnection after the PMK timeout. If * no roaming occurs between the reauth threshold and PMK expiration, * disassociation is still forced. - * * @NL80211_ATTR_RECEIVE_MULTICAST: multicast flag for the * %NL80211_CMD_REGISTER_FRAME command, see the description there. + * @NL80211_ATTR_WIPHY_FREQ_OFFSET: offset of the associated + * %NL80211_ATTR_WIPHY_FREQ in positive KHz. Only valid when supplied with + * an %NL80211_ATTR_WIPHY_FREQ_OFFSET. + * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the + * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. + * * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2960,6 +2970,8 @@ enum nl80211_attrs { NL80211_ATTR_PMK_REAUTH_THRESHOLD, NL80211_ATTR_RECEIVE_MULTICAST, + NL80211_ATTR_WIPHY_FREQ_OFFSET, + NL80211_ATTR_CENTER_FREQ1_OFFSET, /* add attributes here, update the policy in nl80211.c */ @@ -3682,6 +3694,7 @@ enum nl80211_wmm_rule { * (see &enum nl80211_wmm_rule) * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel * in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_OFFSET: frequency offset in KHz * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3712,6 +3725,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_10MHZ, NL80211_FREQUENCY_ATTR_WMM, NL80211_FREQUENCY_ATTR_NO_HE, + NL80211_FREQUENCY_ATTR_OFFSET, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4482,6 +4496,7 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_CHAIN_SIGNAL: per-chain signal strength of last BSS update. * Contains a nested array of signal strength attributes (u8, dBm), * using the nesting index as the antenna number. + * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -4506,6 +4521,7 @@ enum nl80211_bss { NL80211_BSS_PARENT_TSF, NL80211_BSS_PARENT_BSSID, NL80211_BSS_CHAIN_SIGNAL, + NL80211_BSS_FREQUENCY_OFFSET, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f6523f1485a3..87d7efd186d0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -365,6 +365,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, + [NL80211_ATTR_CENTER_FREQ1_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1), @@ -638,6 +639,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, + [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), }; /* policy for the key attributes */ @@ -904,6 +906,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, chan->center_freq)) goto nla_put_failure; + if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_OFFSET, chan->freq_offset)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_DISABLED) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED)) goto nla_put_failure; @@ -1309,13 +1314,11 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) } static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy, - struct nlattr *tb) + u32 freq) { struct ieee80211_channel *chan; - if (tb == NULL) - return NULL; - chan = ieee80211_get_channel(wiphy, nla_get_u32(tb)); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) return NULL; return chan; @@ -2770,13 +2773,17 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, if (!attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; - control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); + control_freq = MHZ_TO_KHZ( + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + control_freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); memset(chandef, 0, sizeof(*chandef)); - - chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); + chandef->chan = ieee80211_get_channel_khz(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - chandef->center_freq1 = control_freq; + chandef->center_freq1 = KHZ_TO_MHZ(control_freq); + chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; /* Primary channel not allowed */ @@ -2824,9 +2831,15 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (attrs[NL80211_ATTR_CENTER_FREQ1]) + if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); + if (attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]) + chandef->freq1_offset = nla_get_u32( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]); + else + chandef->freq1_offset = 0; + } if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); @@ -3259,6 +3272,9 @@ static int nl80211_send_chandef(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chandef->chan->center_freq)) return -ENOBUFS; + if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + chandef->chan->freq_offset)) + return -ENOBUFS; switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: @@ -8873,6 +8889,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) || nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) || + nla_put_u32(msg, NL80211_BSS_FREQUENCY_OFFSET, + res->channel->freq_offset) || nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) || nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO, jiffies_to_msecs(jiffies - intbss->ts))) @@ -9141,6 +9159,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) enum nl80211_auth_type auth_type; struct key_parse key; bool local_state_change; + u32 freq; if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -9197,8 +9216,12 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -9388,6 +9411,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) struct cfg80211_assoc_request req = {}; const u8 *bssid, *ssid; int err, ssid_len = 0; + u32 freq; if (dev->ieee80211_ptr->conn_owner_nlportid && dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) @@ -9407,8 +9431,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -10088,6 +10115,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) struct cfg80211_connect_params connect; struct wiphy *wiphy; struct cfg80211_cached_keys *connkeys = NULL; + u32 freq = 0; int err; memset(&connect, 0, sizeof(connect)); @@ -10158,14 +10186,21 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); - if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - connect.channel = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) + freq = MHZ_TO_KHZ(nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + if (freq) { + connect.channel = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel) return -EINVAL; } else if (info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]) { - connect.channel_hint = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = MHZ_TO_KHZ(freq); + connect.channel_hint = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel_hint) return -EINVAL; } @@ -16215,6 +16250,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, freq % 1000) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16864,8 +16900,10 @@ void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, - KHZ_TO_MHZ(freq))) || + (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + freq % 1000))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) -- cgit v1.2.3 From 2032f3b2f943256ff40df23182913dfc7e73ec6a Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Thu, 30 Apr 2020 10:25:52 -0700 Subject: nl80211: support scan frequencies in KHz If the driver advertises NL80211_EXT_FEATURE_SCAN_FREQ_KHZ userspace can omit NL80211_ATTR_SCAN_FREQUENCIES in favor of an NL80211_ATTR_SCAN_FREQ_KHZ. To get scan results in KHz userspace must also set the NL80211_SCAN_FLAG_FREQ_KHZ. This lets nl80211 remain compatible with older userspaces while not requring and sending redundant (and potentially incorrect) scan frequency sets. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200430172554.18383-4-thomas@adapt-ip.com [use just nla_nest_start() (not _noflag) for NL80211_ATTR_SCAN_FREQ_KHZ] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 ++++++++++- net/mac80211/main.c | 2 ++ net/wireless/nl80211.c | 51 +++++++++++++++++++++++++++++++++----------- 3 files changed, 53 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b1cd132c1d27..47d39b6a073d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2492,7 +2492,7 @@ enum nl80211_commands { * an %NL80211_ATTR_WIPHY_FREQ_OFFSET. * @NL80211_ATTR_CENTER_FREQ1_OFFSET: Center frequency offset in KHz for the * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. - * + * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2972,6 +2972,7 @@ enum nl80211_attrs { NL80211_ATTR_RECEIVE_MULTICAST, NL80211_ATTR_WIPHY_FREQ_OFFSET, NL80211_ATTR_CENTER_FREQ1_OFFSET, + NL80211_ATTR_SCAN_FREQ_KHZ, /* add attributes here, update the policy in nl80211.c */ @@ -5723,6 +5724,11 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS: management frame registrations * are possible for multicast frames and those will be reported properly. * + * @NL80211_EXT_FEATURE_SCAN_FREQ_KHZ: This driver supports receiving and + * reporting scan request with %NL80211_ATTR_SCAN_FREQ_KHZ. In order to + * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be + * included in the scan request. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5776,6 +5782,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_DEL_IBSS_STA, NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5887,6 +5894,9 @@ enum nl80211_timeout_reason { * @NL80211_SCAN_FLAG_MIN_PREQ_CONTENT: minimize probe request content to * only have supported rates and no additional capabilities (unless * added by userspace explicitly.) + * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with + * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means + * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -5902,6 +5912,7 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_HIGH_ACCURACY = 1<<10, NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, + NL80211_SCAN_FLAG_FREQ_KHZ = 1<<13, }; /** diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 06c90d360633..ac74bd780b42 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -596,6 +596,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ); if (!ops->hw_scan) { wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 87d7efd186d0..84bfa147769a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -640,6 +640,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), + [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, }; /* policy for the key attributes */ @@ -7719,6 +7720,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_scan_request *request; + struct nlattr *scan_freqs = NULL; + bool scan_freqs_khz = false; struct nlattr *attr; struct wiphy *wiphy; int err, tmp, n_ssids = 0, n_channels, i; @@ -7737,9 +7740,17 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto unlock; } - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { - n_channels = validate_scan_freqs( - info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); + if (info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]) { + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ)) + return -EOPNOTSUPP; + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]; + scan_freqs_khz = true; + } else if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]; + + if (scan_freqs) { + n_channels = validate_scan_freqs(scan_freqs); if (!n_channels) { err = -EINVAL; goto unlock; @@ -7787,13 +7798,16 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } i = 0; - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + if (scan_freqs) { /* user specified, bail out if channel not found */ - nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) { + nla_for_each_nested(attr, scan_freqs, tmp) { struct ieee80211_channel *chan; + int freq = nla_get_u32(attr); - chan = ieee80211_get_channel(wiphy, nla_get_u32(attr)); + if (!scan_freqs_khz) + freq = MHZ_TO_KHZ(freq); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan) { err = -EINVAL; goto out_free; @@ -15231,14 +15245,27 @@ static int nl80211_add_scan_req(struct sk_buff *msg, } nla_nest_end(msg, nest); - nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_FREQUENCIES); - if (!nest) - goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { - if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) { + nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ); + if (!nest) + goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, + ieee80211_channel_to_khz(req->channels[i]))) + goto nla_put_failure; + } + nla_nest_end(msg, nest); + } else { + nest = nla_nest_start_noflag(msg, + NL80211_ATTR_SCAN_FREQUENCIES); + if (!nest) goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + goto nla_put_failure; + } + nla_nest_end(msg, nest); } - nla_nest_end(msg, nest); if (req->ie && nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie)) -- cgit v1.2.3 From dca9ca2d588bd2c0989c671f048540b82e57cf1e Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Fri, 8 May 2020 16:42:00 +0200 Subject: nl80211: add ability to report TX status for control port TX This adds the necessary capabilities in nl80211 to allow drivers to assign a cookie to control port TX frames (returned via extack in the netlink ACK message of the command) and then later report the frame's status. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200508144202.7678-2-markus.theil@tu-ilmenau.de [use extack cookie instead of explicit message, recombine patches] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 20 +++++++++++++++++++- include/uapi/linux/nl80211.h | 12 ++++++++++++ net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/tx.c | 3 ++- net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++-------- net/wireless/rdev-ops.h | 9 ++++++--- net/wireless/trace.h | 17 +++++++++++++++++ 7 files changed, 91 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 021366cfb2b0..f842f3652026 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4069,7 +4069,8 @@ struct cfg80211_ops { struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, const __be16 proto, - const bool noencrypt); + const bool noencrypt, + u64 *cookie); int (*get_ftm_responder_stats)(struct wiphy *wiphy, struct net_device *dev, @@ -7049,6 +7050,23 @@ static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf, size_t len, bool ack, gfp_t gfp); +/** + * cfg80211_control_port_tx_status - notification of TX status for control + * port frames + * @wdev: wireless device receiving the frame + * @cookie: Cookie returned by cfg80211_ops::tx_control_port() + * @buf: Data frame (header + body) + * @len: length of the frame data + * @ack: Whether frame was acknowledged + * @gfp: context flags + * + * This function is called whenever a control port frame was requested to be + * transmitted with cfg80211_ops::tx_control_port() to report the TX status of + * the transmission attempt. + */ +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp); /** * cfg80211_rx_control_port - notification about a received control port frame diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 47d39b6a073d..0f324b6b81cc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1164,6 +1164,12 @@ * dropped because it did not include a valid MME MIC while beacon * protection was enabled (BIGTK configured in station mode). * + * @NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS: Report TX status of a control + * port frame transmitted with %NL80211_CMD_CONTROL_PORT_FRAME. + * %NL80211_ATTR_COOKIE identifies the TX command and %NL80211_ATTR_FRAME + * includes the contents of the frame. %NL80211_ATTR_ACK flag is included + * if the recipient acknowledged the frame. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1392,6 +1398,8 @@ enum nl80211_commands { NL80211_CMD_UNPROT_BEACON, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -5729,6 +5737,9 @@ enum nl80211_feature_flags { * report %NL80211_ATTR_SCAN_FREQ_KHZ, %NL80211_SCAN_FLAG_FREQ_KHZ must be * included in the scan request. * + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS: The driver + * can report tx status for control port over nl80211 tx operations. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5783,6 +5794,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2d1b6cb75497..b87dc873825b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1800,7 +1800,8 @@ void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted); + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 47f460c8bd74..5931128e1855 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5339,7 +5339,8 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted) + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 84bfa147769a..7ea764865546 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13866,6 +13866,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) { + bool dont_wait_for_ack = info->attrs[NL80211_ATTR_DONT_WAIT_FOR_ACK]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -13874,6 +13875,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) u8 *dest; u16 proto; bool noencrypt; + u64 cookie = 0; int err; if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -13918,9 +13920,12 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) noencrypt = nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); - return rdev_tx_control_port(rdev, dev, buf, len, - dest, cpu_to_be16(proto), noencrypt); - + err = rdev_tx_control_port(rdev, dev, buf, len, + dest, cpu_to_be16(proto), noencrypt, + dont_wait_for_ack ? NULL : &cookie); + if (!err && !dont_wait_for_ack) + nl_set_extack_cookie_u64(info->extack, cookie); + return err; out: wdev_unlock(wdev); return err; @@ -16294,8 +16299,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) +static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp, enum nl80211_commands command) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -16303,13 +16309,16 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, struct sk_buff *msg; void *hdr; - trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + if (command == NL80211_CMD_FRAME_TX_STATUS) + trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + else + trace_cfg80211_control_port_tx_status(wdev, cookie, ack); msg = nlmsg_new(100 + len, gfp); if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS); + hdr = nl80211hdr_put(msg, 0, 0, 0, command); if (!hdr) { nlmsg_free(msg); return; @@ -16332,9 +16341,25 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, NL80211_MCGRP_MLME, gfp); return; - nla_put_failure: +nla_put_failure: nlmsg_free(msg); } + +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS); +} +EXPORT_SYMBOL(cfg80211_control_port_tx_status); + +void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_FRAME_TX_STATUS); +} EXPORT_SYMBOL(cfg80211_mgmt_tx_status); static int __nl80211_rx_control_port(struct net_device *dev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index df5142e86c4f..950d57494168 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -748,14 +748,17 @@ static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, - const bool noencrypt) + const bool noencrypt, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt); - trace_rdev_return_int(&rdev->wiphy, ret); + dest, proto, noencrypt, cookie); + if (cookie) + trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); + else + trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index f2ab44a2a3e4..b23cab016521 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2861,6 +2861,23 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) ); +TRACE_EVENT(cfg80211_control_port_tx_status, + TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack), + TP_ARGS(wdev, cookie, ack), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u64, cookie) + __field(bool, ack) + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->cookie = cookie; + __entry->ack = ack; + ), + TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s", + WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) +); + TRACE_EVENT(cfg80211_rx_control_port, TP_PROTO(struct net_device *netdev, struct sk_buff *skb, bool unencrypted), -- cgit v1.2.3 From 9a5f6488623730dc16cca0836ade23869761adee Mon Sep 17 00:00:00 2001 From: Tamizh Chelvam Date: Wed, 13 May 2020 13:41:44 +0530 Subject: nl80211: Add support to configure TID specific Tx rate configuration This patch adds support to configure per TID Tx Rate configuration through NL80211_TID_CONFIG_ATTR_TX_RATE* attributes. And it uses nl80211_parse_tx_bitrate_mask api to validate the Tx rate mask. Signed-off-by: Tamizh Chelvam Link: https://lore.kernel.org/r/1589357504-10175-1-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++---------- include/uapi/linux/nl80211.h | 21 +++++++++++++++++ net/wireless/nl80211.c | 56 +++++++++++++++++++++++++++++++++----------- 3 files changed, 80 insertions(+), 26 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f842f3652026..e2dbc9c02ef3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -630,6 +630,19 @@ struct cfg80211_chan_def { u16 freq1_offset; }; +/* + * cfg80211_bitrate_mask - masks for bitrate control + */ +struct cfg80211_bitrate_mask { + struct { + u32 legacy; + u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; + u16 vht_mcs[NL80211_VHT_NSS_MAX]; + enum nl80211_txrate_gi gi; + } control[NUM_NL80211_BANDS]; +}; + + /** * struct cfg80211_tid_cfg - TID specific configuration * @config_override: Flag to notify driver to reset TID configuration @@ -643,6 +656,8 @@ struct cfg80211_chan_def { * @ampdu: Enable/Disable MPDU aggregation * @rtscts: Enable/Disable RTS/CTS * @amsdu: Enable/Disable MSDU aggregation + * @txrate_type: Tx bitrate mask type + * @txrate_mask: Tx bitrate to be applied for the TID */ struct cfg80211_tid_cfg { bool config_override; @@ -653,6 +668,8 @@ struct cfg80211_tid_cfg { enum nl80211_tid_config ampdu; enum nl80211_tid_config rtscts; enum nl80211_tid_config amsdu; + enum nl80211_tx_rate_setting txrate_type; + struct cfg80211_bitrate_mask txrate_mask; }; /** @@ -1007,18 +1024,6 @@ struct cfg80211_acl_data { struct mac_address mac_addrs[]; }; -/* - * cfg80211_bitrate_mask - masks for bitrate control - */ -struct cfg80211_bitrate_mask { - struct { - u32 legacy; - u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; - u16 vht_mcs[NL80211_VHT_NSS_MAX]; - enum nl80211_txrate_gi gi; - } control[NUM_NL80211_BANDS]; -}; - /** * enum cfg80211_ap_settings_flags - AP settings flags * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0f324b6b81cc..c14666b75e57 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4841,6 +4841,17 @@ enum nl80211_tid_config { NL80211_TID_CONFIG_DISABLE, }; +/* enum nl80211_tx_rate_setting - TX rate configuration type + * @NL80211_TX_RATE_AUTOMATIC: automatically determine TX rate + * @NL80211_TX_RATE_LIMITED: limit the TX rate by the TX rate parameter + * @NL80211_TX_RATE_FIXED: fix TX rate to the TX rate parameter + */ +enum nl80211_tx_rate_setting { + NL80211_TX_RATE_AUTOMATIC, + NL80211_TX_RATE_LIMITED, + NL80211_TX_RATE_FIXED, +}; + /* enum nl80211_tid_config_attr - TID specific configuration. * @NL80211_TID_CONFIG_ATTR_PAD: pad attribute for 64-bit values * @NL80211_TID_CONFIG_ATTR_VIF_SUPP: a bitmap (u64) of attributes supported @@ -4876,6 +4887,14 @@ enum nl80211_tid_config { * @NL80211_TID_CONFIG_ATTR_AMSDU_CTRL: Enable/Disable MSDU aggregation * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. * Its type is u8, using the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE: This attribute will be useful + * to notfiy the driver that what type of txrate should be used + * for the TIDs specified in %NL80211_TID_CONFIG_ATTR_TIDS. using + * the values form &nl80211_tx_rate_setting. + * @NL80211_TID_CONFIG_ATTR_TX_RATE: Data frame TX rate mask should be applied + * with the parameters passed through %NL80211_ATTR_TX_RATES. + * configuration is applied to the data frame for the tid to that connected + * station. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4890,6 +4909,8 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, NL80211_TID_CONFIG_ATTR_AMSDU_CTRL, + NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, + NL80211_TID_CONFIG_ATTR_TX_RATE, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7ea764865546..22c4d13e28cb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -329,6 +329,15 @@ he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = { [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG }, }; +static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_TXRATE_HT] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), + [NL80211_TXRATE_GI] = { .type = NLA_U8 }, +}; + static const struct nla_policy nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { [NL80211_TID_CONFIG_ATTR_VIF_SUPP] = { .type = NLA_U64 }, @@ -345,6 +354,10 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE] = + NLA_POLICY_MAX(NLA_U8, NL80211_TX_RATE_FIXED), + [NL80211_TID_CONFIG_ATTR_TX_RATE] = + NLA_POLICY_NESTED(nl80211_txattr_policy), }; static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -4388,16 +4401,9 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, return true; } -static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { - [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_RATES }, - [NL80211_TXRATE_HT] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), - [NL80211_TXRATE_GI] = { .type = NLA_U8 }, -}; - static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, + struct nlattr *attrs[], + enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; @@ -4428,14 +4434,14 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, } /* if no rates are given set it back to the defaults */ - if (!info->attrs[NL80211_ATTR_TX_RATES]) + if (!attrs[attr]) goto out; /* The nested attribute uses enum nl80211_band as the index. This maps * directly to the enum nl80211_band values used in cfg80211. */ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); - nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { + nla_for_each_nested(tx_rates, attrs[attr], rem) { enum nl80211_band band = nla_type(tx_rates); int err; @@ -4940,7 +4946,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, ¶ms.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + ¶ms.beacon_rate); if (err) return err; @@ -10753,7 +10761,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; - err = nl80211_parse_tx_bitrate_mask(info, &mask); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, &mask); if (err) return err; @@ -11359,7 +11368,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + &setup.beacon_rate); if (err) return err; @@ -14139,6 +14150,23 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE]) { + u32 idx = NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, attr; + + tid_conf->txrate_type = nla_get_u8(attrs[idx]); + + if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { + attr = NL80211_TID_CONFIG_ATTR_TX_RATE; + err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, + &tid_conf->txrate_mask); + if (err) + return err; + + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE); + } + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From 1b9ae0c92925ac40489be526d67d0010d0724ce0 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Thu, 21 May 2020 22:14:22 +0200 Subject: wireless: Use linux/stddef.h instead of stddef.h When compiling inside the kernel include linux/stddef.h instead of stddef.h. When I compile this header file in backports for power PC I run into a conflict with ptrdiff_t. I was unable to reproduce this in mainline kernel. I still would like to fix this problem in the kernel. Fixes: 6989310f5d43 ("wireless: Use offsetof instead of custom macro.") Signed-off-by: Hauke Mehrtens Link: https://lore.kernel.org/r/20200521201422.16493-1-hauke@hauke-m.de Signed-off-by: Johannes Berg --- include/uapi/linux/wireless.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/wireless.h b/include/uapi/linux/wireless.h index a2c006a364e0..24f3371ad826 100644 --- a/include/uapi/linux/wireless.h +++ b/include/uapi/linux/wireless.h @@ -74,7 +74,11 @@ #include /* for "struct sockaddr" et al */ #include /* for IFNAMSIZ and co... */ -#include /* for offsetof */ +#ifdef __KERNEL__ +# include /* for offsetof */ +#else +# include /* for offsetof */ +#endif /***************************** VERSION *****************************/ /* -- cgit v1.2.3 From 3234ac664a870e6ea69ae3a57d824cd7edbeacc5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 May 2020 14:06:17 -0700 Subject: /dev/mem: Revoke mappings when a driver claims the region Close the hole of holding a mapping over kernel driver takeover event of a given address range. Commit 90a545e98126 ("restrict /dev/mem to idle io memory ranges") introduced CONFIG_IO_STRICT_DEVMEM with the goal of protecting the kernel against scenarios where a /dev/mem user tramples memory that a kernel driver owns. However, this protection only prevents *new* read(), write() and mmap() requests. Established mappings prior to the driver calling request_mem_region() are left alone. Especially with persistent memory, and the core kernel metadata that is stored there, there are plentiful scenarios for a /dev/mem user to violate the expectations of the driver and cause amplified damage. Teach request_mem_region() to find and shoot down active /dev/mem mappings that it believes it has successfully claimed for the exclusive use of the driver. Effectively a driver call to request_mem_region() becomes a hole-punch on the /dev/mem device. The typical usage of unmap_mapping_range() is part of truncate_pagecache() to punch a hole in a file, but in this case the implementation is only doing the "first half" of a hole punch. Namely it is just evacuating current established mappings of the "hole", and it relies on the fact that /dev/mem establishes mappings in terms of absolute physical address offsets. Once existing mmap users are invalidated they can attempt to re-establish the mapping, or attempt to continue issuing read(2) / write(2) to the invalidated extent, but they will then be subject to the CONFIG_IO_STRICT_DEVMEM checking that can block those subsequent accesses. Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Kees Cook Cc: Matthew Wilcox Cc: Russell King Cc: Andrew Morton Cc: Greg Kroah-Hartman Fixes: 90a545e98126 ("restrict /dev/mem to idle io memory ranges") Signed-off-by: Dan Williams Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 101 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/ioport.h | 6 +++ include/uapi/linux/magic.h | 1 + kernel/resource.c | 5 +++ 4 files changed, 111 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 43dd0891ca1e..31cae88a730b 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -31,11 +31,15 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_IA64 # include #endif +#define DEVMEM_MINOR 1 #define DEVPORT_MINOR 4 static inline unsigned long size_inside_page(unsigned long start, @@ -805,12 +809,64 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) return ret; } +static struct inode *devmem_inode; + +#ifdef CONFIG_IO_STRICT_DEVMEM +void revoke_devmem(struct resource *res) +{ + struct inode *inode = READ_ONCE(devmem_inode); + + /* + * Check that the initialization has completed. Losing the race + * is ok because it means drivers are claiming resources before + * the fs_initcall level of init and prevent /dev/mem from + * establishing mappings. + */ + if (!inode) + return; + + /* + * The expectation is that the driver has successfully marked + * the resource busy by this point, so devmem_is_allowed() + * should start returning false, however for performance this + * does not iterate the entire resource range. + */ + if (devmem_is_allowed(PHYS_PFN(res->start)) && + devmem_is_allowed(PHYS_PFN(res->end))) { + /* + * *cringe* iomem=relaxed says "go ahead, what's the + * worst that can happen?" + */ + return; + } + + unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1); +} +#endif + static int open_port(struct inode *inode, struct file *filp) { + int rc; + if (!capable(CAP_SYS_RAWIO)) return -EPERM; - return security_locked_down(LOCKDOWN_DEV_MEM); + rc = security_locked_down(LOCKDOWN_DEV_MEM); + if (rc) + return rc; + + if (iminor(inode) != DEVMEM_MINOR) + return 0; + + /* + * Use a unified address space to have a single point to manage + * revocations when drivers want to take over a /dev/mem mapped + * range. + */ + inode->i_mapping = devmem_inode->i_mapping; + filp->f_mapping = inode->i_mapping; + + return 0; } #define zero_lseek null_lseek @@ -885,7 +941,7 @@ static const struct memdev { fmode_t fmode; } devlist[] = { #ifdef CONFIG_DEVMEM - [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, + [DEVMEM_MINOR] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, #endif #ifdef CONFIG_DEVKMEM [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET }, @@ -939,6 +995,45 @@ static char *mem_devnode(struct device *dev, umode_t *mode) static struct class *mem_class; +static int devmem_fs_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type devmem_fs_type = { + .name = "devmem", + .owner = THIS_MODULE, + .init_fs_context = devmem_fs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int devmem_init_inode(void) +{ + static struct vfsmount *devmem_vfs_mount; + static int devmem_fs_cnt; + struct inode *inode; + int rc; + + rc = simple_pin_fs(&devmem_fs_type, &devmem_vfs_mount, &devmem_fs_cnt); + if (rc < 0) { + pr_err("Cannot mount /dev/mem pseudo filesystem: %d\n", rc); + return rc; + } + + inode = alloc_anon_inode(devmem_vfs_mount->mnt_sb); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + pr_err("Cannot allocate inode for /dev/mem: %d\n", rc); + simple_release_fs(&devmem_vfs_mount, &devmem_fs_cnt); + return rc; + } + + /* publish /dev/mem initialized */ + WRITE_ONCE(devmem_inode, inode); + + return 0; +} + static int __init chr_dev_init(void) { int minor; @@ -960,6 +1055,8 @@ static int __init chr_dev_init(void) */ if ((minor == DEVPORT_MINOR) && !arch_has_dev_port()) continue; + if ((minor == DEVMEM_MINOR) && devmem_init_inode() != 0) + continue; device_create(mem_class, NULL, MKDEV(MEM_MAJOR, minor), NULL, devlist[minor].name); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index a9b9170b5dd2..6c3eca90cbc4 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -301,5 +301,11 @@ struct resource *devm_request_free_mem_region(struct device *dev, struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name); +#ifdef CONFIG_IO_STRICT_DEVMEM +void revoke_devmem(struct resource *res); +#else +static inline void revoke_devmem(struct resource *res) { }; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index d78064007b17..f3956fc11de6 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -94,6 +94,7 @@ #define BALLOON_KVM_MAGIC 0x13661366 #define ZSMALLOC_MAGIC 0x58295829 #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ +#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define Z3FOLD_MAGIC 0x33 #define PPC_CMM_MAGIC 0xc7571590 diff --git a/kernel/resource.c b/kernel/resource.c index 76036a41143b..841737bbda9e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1126,6 +1126,7 @@ struct resource * __request_region(struct resource *parent, { DECLARE_WAITQUEUE(wait, current); struct resource *res = alloc_resource(GFP_KERNEL); + struct resource *orig_parent = parent; if (!res) return NULL; @@ -1176,6 +1177,10 @@ struct resource * __request_region(struct resource *parent, break; } write_unlock(&resource_lock); + + if (res && orig_parent == &iomem_resource) + revoke_devmem(res); + return res; } EXPORT_SYMBOL(__request_region); -- cgit v1.2.3 From 20f6a05ef63594feb0c6dfbd629da0448b43124d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 27 May 2020 12:34:30 +0000 Subject: bridge: mrp: Rework the MRP netlink interface This patch reworks the MRP netlink interface. Before, each attribute represented a binary structure which made it hard to be extended. Therefore update the MRP netlink interface such that each existing attribute to be a nested attribute which contains the fields of the binary structures. In this way the MRP netlink interface can be extended without breaking the backwards compatibility. It is also using strict checking for attributes under the MRP top attribute. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 64 ++++++++-- net/bridge/br_mrp.c | 8 +- net/bridge/br_mrp_netlink.c | 266 +++++++++++++++++++++++++++++++++++------ net/bridge/br_private_mrp.h | 2 +- 4 files changed, 290 insertions(+), 50 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bd8c95488f16..5a43eb86c93b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -169,17 +169,69 @@ enum { __IFLA_BRIDGE_MRP_MAX, }; +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_INSTANCE_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE_RING_ID, + IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + __IFLA_BRIDGE_MRP_INSTANCE_MAX, +}; + +#define IFLA_BRIDGE_MRP_INSTANCE_MAX (__IFLA_BRIDGE_MRP_INSTANCE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_STATE_STATE, + __IFLA_BRIDGE_MRP_PORT_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_STATE_MAX (__IFLA_BRIDGE_MRP_PORT_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_PORT_ROLE_ROLE, + __IFLA_BRIDGE_MRP_PORT_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_PORT_ROLE_MAX (__IFLA_BRIDGE_MRP_PORT_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_STATE_UNSPEC, + IFLA_BRIDGE_MRP_RING_STATE_RING_ID, + IFLA_BRIDGE_MRP_RING_STATE_STATE, + __IFLA_BRIDGE_MRP_RING_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_STATE_MAX (__IFLA_BRIDGE_MRP_RING_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_RING_ROLE_RING_ID, + IFLA_BRIDGE_MRP_RING_ROLE_ROLE, + __IFLA_BRIDGE_MRP_RING_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_RING_ROLE_MAX (__IFLA_BRIDGE_MRP_RING_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_TEST_RING_ID, + IFLA_BRIDGE_MRP_START_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_TEST_PERIOD, + __IFLA_BRIDGE_MRP_START_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_TEST_MAX (__IFLA_BRIDGE_MRP_START_TEST_MAX - 1) + struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; __u32 s_ifindex; }; -struct br_mrp_port_role { - __u32 ring_id; - __u32 role; -}; - struct br_mrp_ring_state { __u32 ring_id; __u32 ring_state; @@ -197,8 +249,6 @@ struct br_mrp_start_test { __u32 period; }; -#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) - struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 528d767eb026..8ea59504ef47 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -376,24 +376,24 @@ int br_mrp_set_port_state(struct net_bridge_port *p, * note: already called with rtnl_lock */ int br_mrp_set_port_role(struct net_bridge_port *p, - struct br_mrp_port_role *role) + enum br_mrp_port_role_type role) { struct br_mrp *mrp; if (!p || !(p->flags & BR_MRP_AWARE)) return -EINVAL; - mrp = br_mrp_find_id(p->br, role->ring_id); + mrp = br_mrp_find_port(p->br, p); if (!mrp) return -EINVAL; - if (role->role == BR_MRP_PORT_ROLE_PRIMARY) + if (role == BR_MRP_PORT_ROLE_PRIMARY) rcu_assign_pointer(mrp->p_port, p); else rcu_assign_pointer(mrp->s_port, p); - br_mrp_port_switchdev_set_role(p, role->role); + br_mrp_port_switchdev_set_role(p, role); return 0; } diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 4a08a99519b0..d9de780d2ce0 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -8,19 +8,222 @@ static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, - [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_instance)}, - [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_U32 }, - [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_port_role)}, - [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_ring_state)}, - [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_ring_role)}, - [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_EXACT_LEN, - .len = sizeof(struct br_mrp_start_test)}, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED }, }; +static const struct nla_policy +br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { + [IFLA_BRIDGE_MRP_INSTANCE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, +}; + +static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, + int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1]; + struct br_mrp_instance inst; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_INSTANCE_MAX, attr, + br_mrp_instance_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or P_IFINDEX or S_IFINDEX"); + return -EINVAL; + } + + memset(&inst, 0, sizeof(inst)); + + inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); + inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); + inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + + if (cmd == RTM_SETLINK) + return br_mrp_add(br, &inst); + else + return br_mrp_del(br, &inst); + + return 0; +} + +static const struct nla_policy +br_mrp_port_state_policy[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_state_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1]; + enum br_mrp_port_state_type state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_STATE_MAX, attr, + br_mrp_port_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: STATE"); + return -EINVAL; + } + + state = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]); + + return br_mrp_set_port_state(p, state); +} + +static const struct nla_policy +br_mrp_port_role_policy[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_role_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1]; + enum br_mrp_port_role_type role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_ROLE_MAX, attr, + br_mrp_port_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: ROLE"); + return -EINVAL; + } + + role = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]); + + return br_mrp_set_port_role(p, role); +} + +static const struct nla_policy +br_mrp_ring_state_policy[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_state_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1]; + struct br_mrp_ring_state state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_STATE_MAX, attr, + br_mrp_ring_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or STATE"); + return -EINVAL; + } + + memset(&state, 0x0, sizeof(state)); + + state.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID]); + state.ring_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]); + + return br_mrp_set_ring_state(br, &state); +} + +static const struct nla_policy +br_mrp_ring_role_policy[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_role_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1]; + struct br_mrp_ring_role role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_ROLE_MAX, attr, + br_mrp_ring_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or ROLE"); + return -EINVAL; + } + + memset(&role, 0x0, sizeof(role)); + + role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID]); + role.ring_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]); + + return br_mrp_set_ring_role(br, &role); +} + +static const struct nla_policy +br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { + [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, +}; + +static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_START_TEST_MAX + 1]; + struct br_mrp_start_test test; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_TEST_MAX, attr, + br_mrp_start_test_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID] || + !tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL] || + !tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] || + !tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD"); + return -EINVAL; + } + + memset(&test, 0x0, sizeof(test)); + + test.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID]); + test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); + test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); + test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + + return br_mrp_start_test(br, &test); +} + int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { @@ -44,58 +247,45 @@ int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, return err; if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { - struct br_mrp_instance *instance = - nla_data(tb[IFLA_BRIDGE_MRP_INSTANCE]); - - if (cmd == RTM_SETLINK) - err = br_mrp_add(br, instance); - else - err = br_mrp_del(br, instance); + err = br_mrp_instance_parse(br, tb[IFLA_BRIDGE_MRP_INSTANCE], + cmd, extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { - enum br_mrp_port_state_type state = - nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE]); - - err = br_mrp_set_port_state(p, state); + err = br_mrp_port_state_parse(p, tb[IFLA_BRIDGE_MRP_PORT_STATE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { - struct br_mrp_port_role *role = - nla_data(tb[IFLA_BRIDGE_MRP_PORT_ROLE]); - - err = br_mrp_set_port_role(p, role); + err = br_mrp_port_role_parse(p, tb[IFLA_BRIDGE_MRP_PORT_ROLE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { - struct br_mrp_ring_state *state = - nla_data(tb[IFLA_BRIDGE_MRP_RING_STATE]); - - err = br_mrp_set_ring_state(br, state); + err = br_mrp_ring_state_parse(br, + tb[IFLA_BRIDGE_MRP_RING_STATE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { - struct br_mrp_ring_role *role = - nla_data(tb[IFLA_BRIDGE_MRP_RING_ROLE]); - - err = br_mrp_set_ring_role(br, role); + err = br_mrp_ring_role_parse(br, tb[IFLA_BRIDGE_MRP_RING_ROLE], + extack); if (err) return err; } if (tb[IFLA_BRIDGE_MRP_START_TEST]) { - struct br_mrp_start_test *test = - nla_data(tb[IFLA_BRIDGE_MRP_START_TEST]); - - err = br_mrp_start_test(br, test); + err = br_mrp_start_test_parse(br, + tb[IFLA_BRIDGE_MRP_START_TEST], + extack); if (err) return err; } diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 2921a4b59f8e..a0f53cc3ab85 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -37,7 +37,7 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); int br_mrp_set_port_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_set_port_role(struct net_bridge_port *p, - struct br_mrp_port_role *role); + enum br_mrp_port_role_type role); int br_mrp_set_ring_state(struct net_bridge *br, struct br_mrp_ring_state *state); int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); -- cgit v1.2.3 From 34e2ab57a911f8b32b22580d11a02f0b79108245 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:00 +0300 Subject: RDMA/ucma: Extend ucma_connect to receive ECE parameters Active side of CMID initiates connection through librdmacm's rdma_connect() and kernel's ucma_connect(). Extend UCMA interface to handle those new parameters. Link: https://lore.kernel.org/r/20200526103304.196371-3-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 21 +++++++++++++++++++++ drivers/infiniband/core/cma_priv.h | 1 + drivers/infiniband/core/ucma.c | 14 +++++++++++--- include/rdma/rdma_cm.h | 3 +++ include/uapi/rdma/rdma_user_cm.h | 6 ++++++ 5 files changed, 42 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 432eec472164..e81b8a523a3e 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4036,6 +4036,27 @@ err: } EXPORT_SYMBOL(rdma_connect); +/** + * rdma_connect_ece - Initiate an active connection request with ECE data. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * @ece: ECE parameters + * + * See rdma_connect() explanation. + */ +int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_connect(id, conn_param); +} +EXPORT_SYMBOL(rdma_connect_ece); + static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index 5edcf44a9307..caece96ebcf5 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -95,6 +95,7 @@ struct rdma_id_private { * Internal to RDMA/core, don't use in the drivers */ struct rdma_restrack_entry res; + struct rdma_ucm_ece ece; }; #if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 06127c800a49..7cbb63690241 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1072,12 +1072,15 @@ static void ucma_copy_conn_param(struct rdma_cm_id *id, static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_connect cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; + struct rdma_ucm_connect cmd; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; if (!cmd.conn_param.valid) @@ -1088,8 +1091,13 @@ static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, return PTR_ERR(ctx); ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + mutex_lock(&ctx->mutex); - ret = rdma_connect(ctx->cm_id, &conn_param); + ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index ea8e794785ed..4e2975eb3643 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -264,6 +264,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); +int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece); + /** * rdma_listen - This function is called by the passive side to * listen for incoming connection requests. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 1bb6e75d254b..c1409dd7225f 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -210,10 +210,16 @@ struct rdma_ucm_ud_param { __u8 reserved[7]; }; +struct rdma_ucm_ece { + __u32 vendor_id; + __u32 attr_mod; +}; + struct rdma_ucm_connect { struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; + struct rdma_ucm_ece ece; }; struct rdma_ucm_listen { -- cgit v1.2.3 From 93531ee7b9d1313227d2b4f354989895e8d57b72 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:01 +0300 Subject: RDMA/ucma: Deliver ECE parameters through UCMA events Passive side of CMID connection receives ECE request through REQ message and needs to respond with relevant REP message which will be forwarded to active side. The UCMA events interface is responsible for such communication with the user space (librdmacm). Extend it to provide ECE wire data. Link: https://lore.kernel.org/r/20200526103304.196371-4-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 6 +++++- include/rdma/rdma_cm.h | 1 + include/uapi/rdma/rdma_user_cm.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 7cbb63690241..3e5268cfa164 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -360,6 +360,9 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); + uevent->resp.ece.vendor_id = event->ece.vendor_id; + uevent->resp.ece.attr_mod = event->ece.attr_mod; + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { if (!ctx->backlog) { ret = -ENOMEM; @@ -404,7 +407,8 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, * Old 32 bit user space does not send the 4 byte padding in the * reserved field. We don't care, allow it to keep working. */ - if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved)) + if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - + sizeof(uevent->resp.ece)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 4e2975eb3643..418590c9a9e8 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -111,6 +111,7 @@ struct rdma_cm_event { struct rdma_conn_param conn; struct rdma_ud_param ud; } param; + struct rdma_ucm_ece ece; }; struct rdma_cm_id; diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index c1409dd7225f..19c5c3f74af9 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -297,6 +297,7 @@ struct rdma_ucm_event_resp { struct rdma_ucm_ud_param ud; } param; __u32 reserved; + struct rdma_ucm_ece ece; }; /* Option levels */ -- cgit v1.2.3 From 0cb15372a615a9835893f43e86ae45399eb63996 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:03 +0300 Subject: RDMA/cma: Connect ECE to rdma_accept The rdma_accept() is called by both passive and active sides of CMID connection to mark readiness to start data transfer. For passive side, this is called explicitly, for active side, it is called implicitly while receiving REP message. Provide ECE data to rdma_accept function needed for passive side to send that REP message. Link: https://lore.kernel.org/r/20200526103304.196371-6-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 19 +++++++++++++++++++ drivers/infiniband/core/ucma.c | 14 +++++++++++--- include/rdma/rdma_cm.h | 3 +++ include/uapi/rdma/rdma_user_cm.h | 1 + 4 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f554a371f4fa..d449afe5557b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4090,6 +4090,8 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); @@ -4137,7 +4139,11 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; + + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; } + rep.private_data = private_data; rep.private_data_len = private_data_len; @@ -4195,6 +4201,19 @@ reject: } EXPORT_SYMBOL(__rdma_accept); +int __rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller, struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return __rdma_accept(id, conn_param, caller); +} +EXPORT_SYMBOL(__rdma_accept_ece); + int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 3e5268cfa164..6b27b210b890 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1135,28 +1135,36 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&file->mut); mutex_lock(&ctx->mutex); - ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); + ret = __rdma_accept_ece(ctx->cm_id, &conn_param, NULL, &ece); mutex_unlock(&ctx->mutex); if (!ret) ctx->uid = cmd.uid; mutex_unlock(&file->mut); } else { mutex_lock(&ctx->mutex); - ret = __rdma_accept(ctx->cm_id, NULL, NULL); + ret = __rdma_accept_ece(ctx->cm_id, NULL, NULL, &ece); mutex_unlock(&ctx->mutex); } ucma_put_ctx(ctx); diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 418590c9a9e8..7ac91677660f 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -280,6 +280,9 @@ int rdma_listen(struct rdma_cm_id *id, int backlog); int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, const char *caller); +int __rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller, struct rdma_ucm_ece *ece); + /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 19c5c3f74af9..6b883dde7064 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -232,6 +232,7 @@ struct rdma_ucm_accept { struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; + struct rdma_ucm_ece ece; }; struct rdma_ucm_reject { -- cgit v1.2.3 From 8094ba0ace7f6cd1e31ea8b151fba3594cadfa9a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:04 +0300 Subject: RDMA/cma: Provide ECE reject reason IBTA declares "vendor option not supported" reject reason in REJ messages if passive side doesn't want to accept proposed ECE options. Due to the fact that ECE is managed by userspace, there is a need to let users to provide such rejected reason. Link: https://lore.kernel.org/r/20200526103304.196371-7-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 9 ++++----- drivers/infiniband/core/ucma.c | 15 ++++++++++++++- drivers/infiniband/ulp/isert/ib_isert.c | 5 +++-- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 3 ++- drivers/infiniband/ulp/srpt/ib_srpt.c | 3 ++- drivers/nvme/target/rdma.c | 4 +++- include/rdma/rdma_cm.h | 2 +- include/uapi/rdma/rdma_user_cm.h | 3 ++- net/rds/ib_cm.c | 4 +++- 9 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d449afe5557b..8026ee56546a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4196,7 +4196,7 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, return 0; reject: cma_modify_qp_err(id_priv); - rdma_reject(id, NULL, 0); + rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(__rdma_accept); @@ -4236,7 +4236,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, - u8 private_data_len) + u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; @@ -4251,9 +4251,8 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); - ret = ib_send_cm_rej(id_priv->cm_id.ib, - IB_CM_REJ_CONSUMER_DEFINED, NULL, - 0, private_data, private_data_len); + ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, + private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 6b27b210b890..5b87eee8ccc8 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "core_priv.h" @@ -1181,12 +1182,24 @@ static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + if (!cmd.reason) + cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; + + switch (cmd.reason) { + case IB_CM_REJ_CONSUMER_DEFINED: + case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: + break; + default: + return -EINVAL; + } + ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); - ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len); + ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, + cmd.reason); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index a1a035270cab..b7df38ee8ae0 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -502,7 +503,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) if (!np->enabled) { spin_unlock_bh(&np->np_thread_lock); isert_dbg("iscsi_np is not enabled, reject connect request\n"); - return rdma_reject(cma_id, NULL, 0); + return rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); } spin_unlock_bh(&np->np_thread_lock); @@ -553,7 +554,7 @@ out_rsp_dma_map: isert_free_login_buf(isert_conn); out: kfree(isert_conn); - rdma_reject(cma_id, NULL, 0); + rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 5ef8988ee75b..0d9241f5d9e6 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -15,6 +15,7 @@ #include "rtrs-srv.h" #include "rtrs-log.h" +#include MODULE_DESCRIPTION("RDMA Transport Server"); MODULE_LICENSE("GPL"); @@ -1576,7 +1577,7 @@ static int rtrs_rdma_do_reject(struct rdma_cm_id *cm_id, int errno) .errno = cpu_to_le16(errno), }; - err = rdma_reject(cm_id, &msg, sizeof(msg)); + err = rdma_reject(cm_id, &msg, sizeof(msg), IB_CM_REJ_CONSUMER_DEFINED); if (err) pr_err("rdma_reject(), err: %d\n", err); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index a294630f2100..cdc8c239d6c0 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2497,7 +2497,8 @@ reject: SRP_BUF_FORMAT_INDIRECT); if (rdma_cm_id) - rdma_reject(rdma_cm_id, rej, sizeof(*rej)); + rdma_reject(rdma_cm_id, rej, sizeof(*rej), + IB_CM_REJ_CONSUMER_DEFINED); else ib_send_cm_rej(ib_cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, rej, sizeof(*rej)); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index fd47de0e4e4e..55aaf03a9580 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "nvmet.h" @@ -1138,7 +1139,8 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); rej.sts = cpu_to_le16(status); - return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); + return rdma_reject(cm_id, (void *)&rej, sizeof(rej), + IB_CM_REJ_CONSUMER_DEFINED); } static struct nvmet_rdma_queue * diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 7ac91677660f..939d7abe026f 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -320,7 +320,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event); * rdma_reject - Called to reject a connection request or response. */ int rdma_reject(struct rdma_cm_id *id, const void *private_data, - u8 private_data_len); + u8 private_data_len, u8 reason); /** * rdma_disconnect - This function disconnects the associated QP and diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 6b883dde7064..ed5a514305c1 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -238,7 +238,8 @@ struct rdma_ucm_accept { struct rdma_ucm_reject { __u32 id; __u8 private_data_len; - __u8 reserved[3]; + __u8 reason; + __u8 reserved[2]; __u8 private_data[RDMA_MAX_PRIVATE_DATA]; }; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index c71f4328d138..0fec4171564e 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "rds_single_path.h" #include "rds.h" @@ -927,7 +928,8 @@ out: if (conn) mutex_unlock(&conn->c_cm_lock); if (err) - rdma_reject(cm_id, &err, sizeof(int)); + rdma_reject(cm_id, &err, sizeof(int), + IB_CM_REJ_CONSUMER_DEFINED); return destroy; } -- cgit v1.2.3 From 3e09a427ae7ac347e08dca5ffac64c902860d675 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:34 +0300 Subject: RDMA/mlx5: Get ECE options from FW during create QP Supported ECE options are returned from FW in the create_qp phase and zero means that field is not valid. Such default value allows us to reuse reserved field without worries about comp_mask. Update create QP API to return ECE options. Link: https://lore.kernel.org/r/20200526115440.205922-3-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 16 +++++++++++----- drivers/infiniband/hw/mlx5/qp.h | 4 ++-- drivers/infiniband/hw/mlx5/qpc.c | 8 ++++---- include/uapi/rdma/mlx5-abi.h | 2 +- 4 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 2e28752e8cd2..be7289c480f7 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1842,6 +1842,7 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr = params->attr; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_qp_base *base; @@ -1894,13 +1895,14 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } base = &qp->trans_qp.base; - err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); kvfree(in); if (err) return err; base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); list_add_tail(&qp->qps_list, &dev->qp_list); @@ -1916,6 +1918,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, { struct ib_qp_init_attr *init_attr = params->attr; struct mlx5_ib_create_qp *ucmd = params->ucmd; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; struct ib_udata *udata = params->udata; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; @@ -2065,7 +2068,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, ¶ms->resp); } else - err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); kvfree(in); if (err) @@ -2073,6 +2076,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); get_cqs(qp->type, init_attr->send_cq, init_attr->recv_cq, &send_cq, &recv_cq); @@ -2105,6 +2109,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *attr = params->attr; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_cq *send_cq; @@ -2195,7 +2200,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); - err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); kvfree(in); if (err) goto err_create; @@ -2779,12 +2784,13 @@ out: qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; mlx5_ib_dbg(dev, - "QP type %d, ib qpn 0x%X, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + "QP type %d, ib qpn 0x%X, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x, ece 0x%x\n", qp->type, qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, params->attr->recv_cq ? to_mcq(params->attr->recv_cq)->mcq.cqn : -1, params->attr->send_cq ? to_mcq(params->attr->send_cq)->mcq.cqn : - -1); + -1, + params->resp.ece_options); return 0; } diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index ad9d76e3e18a..795c21f88962 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -13,8 +13,8 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev); int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *qp, u32 *in, int inlen, u32 *out, int outlen); -int mlx5_core_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, - u32 *in, int inlen); +int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen, u32 *out); int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, void *qpc, struct mlx5_core_qp *qp); int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp); diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index ea62735042f0..69c80859a6ee 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -236,16 +236,16 @@ err_cmd: return err; } -int mlx5_core_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, - u32 *in, int inlen) +int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen, u32 *out) { - u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; u32 din[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; int err; MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); - err = mlx5_cmd_exec(dev->mdev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, + MLX5_ST_SZ_BYTES(create_qp_out)); if (err) return err; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index df1cc3641bda..106fbb3bec6a 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -371,7 +371,7 @@ enum mlx5_ib_create_qp_resp_mask { struct mlx5_ib_create_qp_resp { __u32 bfreg_index; - __u32 reserved; + __u32 ece_options; __u32 comp_mask; __u32 tirn; __u32 tisn; -- cgit v1.2.3 From e383085c24255821e79d3c2aa6302d804b6a1c48 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:35 +0300 Subject: RDMA/mlx5: Set ECE options during QP create Allow users to ask creation of QPs with specific ECE options. Such early set even before RDMA-CM connection is established is useful if user knows exactly which option he needs. Link: https://lore.kernel.org/r/20200526115440.205922-4-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 75 ++++++++++++++++++++++++++++++++++++----- include/uapi/rdma/mlx5-abi.h | 2 ++ 2 files changed, 68 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index be7289c480f7..eb70eb371b4b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1552,6 +1552,7 @@ struct mlx5_create_qp_params { struct ib_udata *udata; size_t inlen; size_t outlen; + size_t ucmd_size; void *ucmd; u8 is_rss_raw : 1; struct ib_qp_init_attr *attr; @@ -1839,6 +1840,7 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_create_qp_params *params) { + struct mlx5_ib_create_qp *ucmd = params->ucmd; struct ib_qp_init_attr *attr = params->attr; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; @@ -1860,6 +1862,8 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!in) return -ENOMEM; + if (MLX5_CAP_GEN(mdev, ece_support)) + MLX5_SET(create_qp_in, in, ece, ucmd->ece_options); qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, MLX5_QP_ST_XRC); @@ -1974,6 +1978,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (is_sqp(init_attr->qp_type)) qp->port = init_attr->port_num; + if (MLX5_CAP_GEN(mdev, ece_support)) + MLX5_SET(create_qp_in, in, ece, ucmd->ece_options); qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, mlx5_st); @@ -2709,19 +2715,22 @@ static int process_udata_size(struct mlx5_ib_dev *dev, struct mlx5_create_qp_params *params) { size_t ucmd = sizeof(struct mlx5_ib_create_qp); - struct ib_qp_init_attr *attr = params->attr; struct ib_udata *udata = params->udata; size_t outlen = udata->outlen; size_t inlen = udata->inlen; params->outlen = min(outlen, sizeof(struct mlx5_ib_create_qp_resp)); - if (attr->qp_type == IB_QPT_DRIVER) { - params->inlen = (inlen < ucmd) ? 0 : ucmd; - goto out; - } - + params->ucmd_size = ucmd; if (!params->is_rss_raw) { - params->inlen = ucmd; + /* User has old rdma-core, which doesn't support ECE */ + size_t min_inlen = + offsetof(struct mlx5_ib_create_qp, ece_options); + + /* + * We will check in check_ucmd_data() that user + * cleared everything after inlen. + */ + params->inlen = (inlen < min_inlen) ? 0 : min(inlen, ucmd); goto out; } @@ -2733,13 +2742,14 @@ static int process_udata_size(struct mlx5_ib_dev *dev, return -EINVAL; ucmd = sizeof(struct mlx5_ib_create_qp_rss); + params->ucmd_size = ucmd; if (inlen > ucmd && !ib_is_udata_cleared(udata, ucmd, inlen - ucmd)) return -EINVAL; params->inlen = min(ucmd, inlen); out: if (!params->inlen) - mlx5_ib_dbg(dev, "udata is too small or not cleared\n"); + mlx5_ib_dbg(dev, "udata is too small\n"); return (params->inlen) ? 0 : -EINVAL; } @@ -2855,6 +2865,49 @@ static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) return 0; } +static int check_ucmd_data(struct mlx5_ib_dev *dev, + struct mlx5_create_qp_params *params) +{ + struct ib_qp_init_attr *attr = params->attr; + struct ib_udata *udata = params->udata; + size_t size, last; + int ret; + + if (params->is_rss_raw) + /* + * These QPs don't have "reserved" field in their + * create_qp input struct, so their data is always valid. + */ + last = sizeof(struct mlx5_ib_create_qp_rss); + else + /* IB_QPT_RAW_PACKET and IB_QPT_DRIVER don't have ECE data */ + switch (attr->qp_type) { + case IB_QPT_DRIVER: + case IB_QPT_RAW_PACKET: + last = offsetof(struct mlx5_ib_create_qp, ece_options); + break; + default: + last = offsetof(struct mlx5_ib_create_qp, reserved); + } + + if (udata->inlen <= last) + return 0; + + /* + * User provides different create_qp structures based on the + * flow and we need to know if he cleared memory after our + * struct create_qp ends. + */ + size = udata->inlen - last; + ret = ib_is_udata_cleared(params->udata, last, size); + if (!ret) + mlx5_ib_dbg( + dev, + "udata is not cleared, inlen = %lu, ucmd = %lu, last = %lu, size = %lu\n", + udata->inlen, params->ucmd_size, last, size); + return ret ? 0 : -EINVAL; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -2888,7 +2941,11 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, if (err) return ERR_PTR(err); - params.ucmd = kzalloc(params.inlen, GFP_KERNEL); + err = check_ucmd_data(dev, ¶ms); + if (err) + return ERR_PTR(err); + + params.ucmd = kzalloc(params.ucmd_size, GFP_KERNEL); if (!params.ucmd) return ERR_PTR(-ENOMEM); diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 106fbb3bec6a..bc9d9e3cb369 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -322,6 +322,8 @@ struct mlx5_ib_create_qp { __aligned_u64 sq_buf_addr; __aligned_u64 access_key; }; + __u32 ece_options; + __u32 reserved; }; /* RX Hash function flags */ -- cgit v1.2.3 From 5f62a521ff20e0b47a8d33421334bd245d6714ff Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:39 +0300 Subject: RDMA/mlx5: Set ECE options during modify QP The most common way to set ECE option will be during modify QP command in INIT2RTR, RTR2RTS and RTS2RTS stages, so update mlx5 to support it. The new bit in the comp_mask is needed to mark that kernel supports ECE and can receive data instead of "reserved" field in the struct mlx5_ib_modify_qp. Link: https://lore.kernel.org/r/20200526115440.205922-8-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 3 +++ drivers/infiniband/hw/mlx5/qp.c | 21 ++++++++++++--------- drivers/infiniband/hw/mlx5/qp.h | 2 +- drivers/infiniband/hw/mlx5/qpc.c | 11 +++++++---- include/uapi/rdma/mlx5-abi.h | 3 ++- 5 files changed, 25 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 623d7898ae6d..6557c8339161 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1959,6 +1959,9 @@ uar_done: resp.response_length += sizeof(resp.dump_fill_mkey); } + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + resp.comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE; + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_mdev; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a24176a8ec83..bfa0f7e43e3b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2353,7 +2353,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) { err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0, - NULL, &base->mqp); + NULL, &base->mqp, NULL); } else { struct mlx5_modify_raw_qp_param raw_qp_param = { .operation = MLX5_CMD_OP_2RST_QP @@ -3978,7 +3978,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); } else { - err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp); + u32 ece = MLX5_CAP_GEN(dev->mdev, ece_support) ? + ucmd->ece_options : 0; + err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp, + &ece); } if (err) @@ -4131,7 +4134,6 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1); MLX5_SET(dctc, dctc, counter_set_id, set_id); - } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; @@ -4182,7 +4184,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx5_ib_modify_qp ucmd = {}; enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; - size_t required_cmd_sz; int err = -EINVAL; int port; @@ -4190,9 +4191,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, return -ENOSYS; if (udata && udata->inlen) { - required_cmd_sz = offsetof(typeof(ucmd), reserved) + - sizeof(ucmd.reserved); - if (udata->inlen < required_cmd_sz) + if (udata->inlen < offsetofend(typeof(ucmd), ece_options)) return -EINVAL; if (udata->inlen > sizeof(ucmd) && @@ -4205,10 +4204,10 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, return -EFAULT; if (ucmd.comp_mask || - memchr_inv(&ucmd.reserved, 0, sizeof(ucmd.reserved)) || memchr_inv(&ucmd.burst_info.reserved, 0, sizeof(ucmd.burst_info.reserved))) return -EOPNOTSUPP; + } if (unlikely(ibqp->qp_type == IB_QPT_GSI)) @@ -4217,8 +4216,12 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? IB_QPT_GSI : qp->type; - if (qp_type == MLX5_IB_QPT_DCT) + if (qp_type == MLX5_IB_QPT_DCT) { + if (memchr_inv(&ucmd.ece_options, 0, sizeof(ucmd.ece_options))) + return -EOPNOTSUPP; + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); + } mutex_lock(&qp->mutex); diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index 795c21f88962..82ea2b94dfa6 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -16,7 +16,7 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *qp, int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, u32 *in, int inlen, u32 *out); int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, - void *qpc, struct mlx5_core_qp *qp); + void *qpc, struct mlx5_core_qp *qp, u32 *ece); int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp); int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct); int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index 69c80859a6ee..d61bc1a88925 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -343,7 +343,7 @@ static void mbox_free(struct mbox_info *mbox) static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, u32 opt_param_mask, void *qpc, - struct mbox_info *mbox, u16 uid) + struct mbox_info *mbox, u16 uid, u32 ece) { mbox->out = NULL; mbox->in = NULL; @@ -391,18 +391,21 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, return -ENOMEM; MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(init2rtr_qp_in, mbox->in, ece, ece); break; case MLX5_CMD_OP_RTR2RTS_QP: if (MBOX_ALLOC(mbox, rtr2rts_qp)) return -ENOMEM; MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(rtr2rts_qp_in, mbox->in, ece, ece); break; case MLX5_CMD_OP_RTS2RTS_QP: if (MBOX_ALLOC(mbox, rts2rts_qp)) return -ENOMEM; MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(rts2rts_qp_in, mbox->in, ece, ece); break; case MLX5_CMD_OP_SQERR2RTS_QP: if (MBOX_ALLOC(mbox, sqerr2rts_qp)) @@ -423,13 +426,13 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, } int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, - void *qpc, struct mlx5_core_qp *qp) + void *qpc, struct mlx5_core_qp *qp, u32 *ece) { struct mbox_info mbox; int err; - err = modify_qp_mbox_alloc(dev->mdev, opcode, qp->qpn, - opt_param_mask, qpc, &mbox, qp->uid); + err = modify_qp_mbox_alloc(dev->mdev, opcode, qp->qpn, opt_param_mask, + qpc, &mbox, qp->uid, (ece) ? *ece : 0); if (err) return err; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index bc9d9e3cb369..24e29a678177 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -100,6 +100,7 @@ struct mlx5_ib_alloc_ucontext_req_v2 { enum mlx5_ib_alloc_ucontext_resp_mask { MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY = 1UL << 1, + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE = 1UL << 2, }; enum mlx5_user_cmds_supp_uhw { @@ -422,7 +423,7 @@ struct mlx5_ib_burst_info { struct mlx5_ib_modify_qp { __u32 comp_mask; struct mlx5_ib_burst_info burst_info; - __u32 reserved; + __u32 ece_options; }; struct mlx5_ib_modify_qp_resp { -- cgit v1.2.3 From 50aec2c3135efd985291adc2e4d1278d52b03de9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:40 +0300 Subject: RDMA/mlx5: Return ECE data after modify QP After users sets the ECE option, FW will return the agreed/supported bits through an output structures of modify QP stages for regular QPs or through create QP for the DCT. Link: https://lore.kernel.org/r/20200526115440.205922-9-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 25 +++++++++++++++++++++---- drivers/infiniband/hw/mlx5/qpc.c | 25 +++++++++++++++++++++++++ include/uapi/rdma/mlx5-abi.h | 2 ++ 3 files changed, 48 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index bfa0f7e43e3b..1988a0375696 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3708,6 +3708,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, enum ib_qp_state cur_state, enum ib_qp_state new_state, const struct mlx5_ib_modify_qp *ucmd, + struct mlx5_ib_modify_qp_resp *resp, struct ib_udata *udata) { static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { @@ -3978,10 +3979,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); } else { - u32 ece = MLX5_CAP_GEN(dev->mdev, ece_support) ? - ucmd->ece_options : 0; + if (udata) { + /* For the kernel flows, the resp will stay zero */ + resp->ece_options = + MLX5_CAP_GEN(dev->mdev, ece_support) ? + ucmd->ece_options : 0; + resp->response_length = sizeof(*resp); + } err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp, - &ece); + &resp->ece_options); } if (err) @@ -4180,6 +4186,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_modify_qp_resp resp = {}; struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_modify_qp ucmd = {}; enum ib_qp_type qp_type; @@ -4292,7 +4299,17 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, - new_state, &ucmd, udata); + new_state, &ucmd, &resp, udata); + + /* resp.response_length is set in ECE supported flows only */ + if (!err && resp.response_length && + udata->outlen >= resp.response_length) + /* + * We don't check return value of the function below + * on purpose, because it is unclear how to unwind the + * error flow after QP was modified to the new state. + */ + ib_copy_to_udata(udata, &resp, resp.response_length); out: mutex_unlock(&qp->mutex); diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index d61bc1a88925..c19d91d6dce8 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -341,6 +341,27 @@ static void mbox_free(struct mbox_info *mbox) kfree(mbox->out); } +static int get_ece_from_mbox(void *out, u16 opcode) +{ + int ece = 0; + + switch (opcode) { + case MLX5_CMD_OP_INIT2RTR_QP: + ece = MLX5_GET(init2rtr_qp_out, out, ece); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + ece = MLX5_GET(rtr2rts_qp_out, out, ece); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + ece = MLX5_GET(rts2rts_qp_out, out, ece); + break; + default: + break; + } + + return ece; +} + static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, u32 opt_param_mask, void *qpc, struct mbox_info *mbox, u16 uid, u32 ece) @@ -438,6 +459,10 @@ int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, err = mlx5_cmd_exec(dev->mdev, mbox.in, mbox.inlen, mbox.out, mbox.outlen); + + if (ece) + *ece = get_ece_from_mbox(mbox.out, opcode); + mbox_free(&mbox); return err; } diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 24e29a678177..27905a0268c9 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -429,6 +429,8 @@ struct mlx5_ib_modify_qp { struct mlx5_ib_modify_qp_resp { __u32 response_length; __u32 dctn; + __u32 ece_options; + __u32 reserved; }; struct mlx5_ib_create_wq_resp { -- cgit v1.2.3 From cb8aa9a3affb7d23b11b11fbed41e2feaabc4b0a Mon Sep 17 00:00:00 2001 From: Romain Bellan Date: Mon, 4 May 2020 21:34:29 +0200 Subject: netfilter: ctnetlink: add kernel side filtering for dump Conntrack dump does not support kernel side filtering (only get exists, but it returns only one entry. And user has to give a full valid tuple) It means that userspace has to implement filtering after receiving many irrelevant entries, consuming resources (conntrack table is sometimes very huge, much more than a routing table for example). This patch adds filtering in kernel side. To achieve this goal, we: * Add a new CTA_FILTER netlink attributes, actually a flag list to parametize filtering * Convert some *nlattr_to_tuple() functions, to allow a partial parsing of CTA_TUPLE_ORIG and CTA_TUPLE_REPLY (so nf_conntrack_tuple it not fully set) Filtering is now possible on: * IP SRC/DST values * Ports for TCP and UDP flows * IMCP(v6) codes types and IDs Filtering is done as an "AND" operator. For example, when flags PROTO_SRC_PORT, PROTO_NUM and IP_SRC are sets, only entries matching all values are dumped. Changes since v1: Set NLM_F_DUMP_FILTERED in nlm flags if entries are filtered Changes since v2: Move several constants to nf_internals.h Move a fix on netlink values check in a separate patch Add a check on not-supported flags Return EOPNOTSUPP if CDA_FILTER is set in ctnetlink_flush_conntrack (not yet implemented) Code style issues Changes since v3: Fix compilation warning reported by kbuild test robot Changes since v4: Fix a regression introduced in v3 (returned EINVAL for valid netlink messages without CTA_MARK) Changes since v5: Change definition of CTA_FILTER_F_ALL Fix a regression when CTA_TUPLE_ZONE is not set Signed-off-by: Romain Bellan Signed-off-by: Florent Fourcot Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 6 +- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 9 + net/netfilter/nf_conntrack_core.c | 19 +- net/netfilter/nf_conntrack_netlink.c | 334 ++++++++++++++++++--- net/netfilter/nf_conntrack_proto_icmp.c | 40 ++- net/netfilter/nf_conntrack_proto_icmpv6.c | 42 ++- net/netfilter/nf_internals.h | 17 ++ 7 files changed, 394 insertions(+), 73 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 4cad1f0a327a..88186b95b3c2 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -42,7 +42,8 @@ struct nf_conntrack_l4proto { /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], - struct nf_conntrack_tuple *t); + struct nf_conntrack_tuple *t, + u_int32_t flags); const struct nla_policy *nla_policy; struct { @@ -152,7 +153,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t); + struct nf_conntrack_tuple *t, + u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 1d41810d17e2..262881792671 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -55,6 +55,7 @@ enum ctattr_type { CTA_LABELS, CTA_LABELS_MASK, CTA_SYNPROXY, + CTA_FILTER, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -276,4 +277,12 @@ enum ctattr_expect_stats { }; #define CTA_STATS_EXP_MAX (__CTA_STATS_EXP_MAX - 1) +enum ctattr_filter { + CTA_FILTER_UNSPEC, + CTA_FILTER_ORIG_FLAGS, + CTA_FILTER_REPLY_FLAGS, + __CTA_FILTER_MAX +}; +#define CTA_FILTER_MAX (__CTA_FILTER_MAX - 1) + #endif /* _IPCONNTRACK_NETLINK_H */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1d57b95d3481..8abb1727bcc4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1974,13 +1974,22 @@ const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { + if (!tb[CTA_PROTO_SRC_PORT]) + return -EINVAL; + + t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); + } - t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); - t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { + if (!tb[CTA_PROTO_DST_PORT]) + return -EINVAL; + + t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9ddfcd002d3b..d7bd8b1f27d5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -54,6 +54,8 @@ #include #include +#include "nf_internals.h" + MODULE_LICENSE("GPL"); static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, @@ -544,14 +546,16 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct, bool extinfo) + struct nf_conn *ct, bool extinfo, unsigned int flags) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; - unsigned int flags = portid ? NLM_F_MULTI : 0, event; + unsigned int event; + if (portid) + flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) @@ -847,17 +851,70 @@ static int ctnetlink_done(struct netlink_callback *cb) } struct ctnetlink_filter { + u_int32_t cta_flags; u8 family; + + u_int32_t orig_flags; + u_int32_t reply_flags; + + struct nf_conntrack_tuple orig; + struct nf_conntrack_tuple reply; + struct nf_conntrack_zone zone; + struct { u_int32_t val; u_int32_t mask; } mark; }; +static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { + [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 }, + [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 }, +}; + +static int ctnetlink_parse_filter(const struct nlattr *attr, + struct ctnetlink_filter *filter) +{ + struct nlattr *tb[CTA_FILTER_MAX + 1]; + int ret = 0; + + ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy, + NULL); + if (ret) + return ret; + + if (tb[CTA_FILTER_ORIG_FLAGS]) { + filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]); + if (filter->orig_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + if (tb[CTA_FILTER_REPLY_FLAGS]) { + filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]); + if (filter->reply_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + return 0; +} + +static int ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone); +static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, + u32 type, u_int8_t l3num, + struct nf_conntrack_zone *zone, + u_int32_t flags); + +/* applied on filters */ +#define CTA_FILTER_F_CTA_MARK (1 << 0) +#define CTA_FILTER_F_CTA_MARK_MASK (1 << 1) + static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { struct ctnetlink_filter *filter; + int err; #ifndef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) @@ -871,14 +928,65 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->family = family; #ifdef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + if (cda[CTA_MARK]) { filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK); + + if (cda[CTA_MARK_MASK]) { + filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK_MASK); + } else { + filter->mark.mask = 0xffffffff; + } + } else if (cda[CTA_MARK_MASK]) { + return ERR_PTR(-EINVAL); } #endif + if (!cda[CTA_FILTER]) + return filter; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + return ERR_PTR(err); + + err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); + if (err < 0) + return ERR_PTR(err); + + if (filter->orig_flags) { + if (!cda[CTA_TUPLE_ORIG]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->orig, + CTA_TUPLE_ORIG, + filter->family, + &filter->zone, + filter->orig_flags); + if (err < 0) + return ERR_PTR(err); + } + + if (filter->reply_flags) { + if (!cda[CTA_TUPLE_REPLY]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->reply, + CTA_TUPLE_REPLY, + filter->family, + &filter->zone, + filter->orig_flags); + if (err < 0) + return ERR_PTR(err); + } + return filter; } +static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) +{ + return family || cda[CTA_MARK] || cda[CTA_FILTER]; +} + static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; @@ -886,7 +994,7 @@ static int ctnetlink_start(struct netlink_callback *cb) struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 family = nfmsg->nfgen_family; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -896,9 +1004,79 @@ static int ctnetlink_start(struct netlink_callback *cb) return 0; } +static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple, + struct nf_conntrack_tuple *ct_tuple, + u_int32_t flags, int family) +{ + switch (family) { + case NFPROTO_IPV4: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + filter_tuple->src.u3.ip != ct_tuple->src.u3.ip) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip) + return 0; + break; + case NFPROTO_IPV6: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + !ipv6_addr_cmp(&filter_tuple->src.u3.in6, + &ct_tuple->src.u3.in6)) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + !ipv6_addr_cmp(&filter_tuple->dst.u3.in6, + &ct_tuple->dst.u3.in6)) + return 0; + break; + } + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) && + filter_tuple->dst.protonum != ct_tuple->dst.protonum) + return 0; + + switch (ct_tuple->dst.protonum) { + case IPPROTO_TCP: + case IPPROTO_UDP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) && + filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) && + filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port) + return 0; + break; + case IPPROTO_ICMP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + case IPPROTO_ICMPV6: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + } + + return 1; +} + static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; + struct nf_conntrack_tuple *tuple; if (filter == NULL) goto out; @@ -910,8 +1088,28 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; + if (filter->orig_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); + if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, + filter->orig_flags, + filter->family)) + goto ignore_entry; + } + + if (filter->reply_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY); + if (!ctnetlink_filter_match_tuple(&filter->reply, tuple, + filter->reply_flags, + filter->family)) + goto ignore_entry; + } + #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) != filter->mark.val) + if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK_MASK)) && + (ct->mark & filter->mark.mask) != filter->mark.val) + goto ignore_entry; + else if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK)) && + ct->mark != filter->mark.val) goto ignore_entry; #endif @@ -925,6 +1123,7 @@ ignore_entry: static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; @@ -979,7 +1178,7 @@ restart: ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, true); + ct, true, flags); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1014,31 +1213,50 @@ out: } static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V4_SRC]) + return -EINVAL; + + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V4_DST]) + return -EINVAL; - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + } return 0; } static int ipv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V6_SRC]) + return -EINVAL; - t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); - t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V6_DST]) + return -EINVAL; + + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + } return 0; } static int ctnetlink_parse_tuple_ip(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; @@ -1054,10 +1272,10 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, switch (tuple->src.l3num) { case NFPROTO_IPV4: - ret = ipv4_nlattr_to_tuple(tb, tuple); + ret = ipv4_nlattr_to_tuple(tb, tuple, flags); break; case NFPROTO_IPV6: - ret = ipv6_nlattr_to_tuple(tb, tuple); + ret = ipv6_nlattr_to_tuple(tb, tuple, flags); break; } @@ -1069,7 +1287,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { }; static int ctnetlink_parse_tuple_proto(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; @@ -1080,8 +1299,12 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, if (ret < 0) return ret; + if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM))) + return 0; + if (!tb[CTA_PROTO_NUM]) return -EINVAL; + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); @@ -1092,7 +1315,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, l4proto->nla_policy, NULL); if (ret == 0) - ret = l4proto->nlattr_to_tuple(tb, tuple); + ret = l4proto->nlattr_to_tuple(tb, tuple, flags); } rcu_read_unlock(); @@ -1143,10 +1366,21 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; +#define CTA_FILTER_F_ALL_CTA_PROTO \ + (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \ + CTA_FILTER_F_CTA_PROTO_DST_PORT | \ + CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_ID | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_ID) + static int -ctnetlink_parse_tuple(const struct nlattr * const cda[], - struct nf_conntrack_tuple *tuple, u32 type, - u_int8_t l3num, struct nf_conntrack_zone *zone) +ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone, + u_int32_t flags) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -1158,23 +1392,32 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; - if (!tb[CTA_TUPLE_IP]) - return -EINVAL; tuple->src.l3num = l3num; - err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_IP_DST) || + flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_TUPLE_IP]) + return -EINVAL; - if (!tb[CTA_TUPLE_PROTO]) - return -EINVAL; + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags); + if (err < 0) + return err; + } - err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) { + if (!tb[CTA_TUPLE_PROTO]) + return -EINVAL; - if (tb[CTA_TUPLE_ZONE]) { + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags); + if (err < 0) + return err; + } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) { + /* Can't manage proto flags without a protonum */ + return -EINVAL; + } + + if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) { if (!zone) return -EINVAL; @@ -1193,6 +1436,15 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } +static int +ctnetlink_parse_tuple(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone) +{ + return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone, + CTA_FILTER_FLAG(ALL)); +} + static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1240,6 +1492,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, + [CTA_FILTER] = { .type = NLA_NESTED }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -1256,7 +1509,10 @@ static int ctnetlink_flush_conntrack(struct net *net, { struct ctnetlink_filter *filter = NULL; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { + if (cda[CTA_FILTER]) + return -EOPNOTSUPP; + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -1385,7 +1641,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, } err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); if (err <= 0) goto free; @@ -1458,7 +1714,7 @@ restart: res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, dying ? true : false); + ct, dying ? true : false, 0); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index c2e3dff773bc..4efd8741c105 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -20,6 +20,8 @@ #include #include +#include "nf_internals.h" + static const unsigned int nf_ct_icmp_timeout = 30*HZ; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -271,20 +273,32 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) { + if (!tb[CTA_PROTO_ICMP_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) { + if (!tb[CTA_PROTO_ICMP_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) { + if (!tb[CTA_PROTO_ICMP_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 6f9144e1f1c1..facd8c64ec4e 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -24,6 +24,8 @@ #include #include +#include "nf_internals.h" + static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, @@ -193,21 +195,33 @@ static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMPV6_TYPE] || - !tb[CTA_PROTO_ICMPV6_CODE] || - !tb[CTA_PROTO_ICMPV6_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); - - if (tuple->dst.u.icmp.type < 128 || - tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type - 128]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { + if (!tb[CTA_PROTO_ICMPV6_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { + if (!tb[CTA_PROTO_ICMPV6_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { + if (!tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + } return 0; } diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index d6c43902ebd7..832ae64179f0 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -6,6 +6,23 @@ #include #include +/* nf_conntrack_netlink.c: applied on tuple filters */ +#define CTA_FILTER_F_CTA_IP_SRC (1 << 0) +#define CTA_FILTER_F_CTA_IP_DST (1 << 1) +#define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2) +#define CTA_FILTER_F_CTA_PROTO_NUM (1 << 3) +#define CTA_FILTER_F_CTA_PROTO_SRC_PORT (1 << 4) +#define CTA_FILTER_F_CTA_PROTO_DST_PORT (1 << 5) +#define CTA_FILTER_F_CTA_PROTO_ICMP_TYPE (1 << 6) +#define CTA_FILTER_F_CTA_PROTO_ICMP_CODE (1 << 7) +#define CTA_FILTER_F_CTA_PROTO_ICMP_ID (1 << 8) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE (1 << 9) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE (1 << 10) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID (1 << 11) +#define CTA_FILTER_F_MAX (1 << 12) +#define CTA_FILTER_F_ALL (CTA_FILTER_F_MAX-1) +#define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr + /* nf_queue.c */ void nf_queue_nf_hook_drop(struct net *net); -- cgit v1.2.3 From 83fc5dd57f86c3ec7d6d22565a6ff6c948853b64 Mon Sep 17 00:00:00 2001 From: Jérôme Pouiller Date: Mon, 11 May 2020 18:19:02 +0200 Subject: mmc: fix compilation of user API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The definitions of MMC_IOC_CMD and of MMC_IOC_MULTI_CMD rely on MMC_BLOCK_MAJOR: #define MMC_IOC_CMD _IOWR(MMC_BLOCK_MAJOR, 0, struct mmc_ioc_cmd) #define MMC_IOC_MULTI_CMD _IOWR(MMC_BLOCK_MAJOR, 1, struct mmc_ioc_multi_cmd) However, MMC_BLOCK_MAJOR is defined in linux/major.h and linux/mmc/ioctl.h did not include it. Signed-off-by: Jérôme Pouiller Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200511161902.191405-1-Jerome.Pouiller@silabs.com Signed-off-by: Ulf Hansson --- include/uapi/linux/mmc/ioctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mmc/ioctl.h b/include/uapi/linux/mmc/ioctl.h index 00c08120f3ba..27a39847d55c 100644 --- a/include/uapi/linux/mmc/ioctl.h +++ b/include/uapi/linux/mmc/ioctl.h @@ -3,6 +3,7 @@ #define LINUX_MMC_IOCTL_H #include +#include struct mmc_ioc_cmd { /* -- cgit v1.2.3 From 130c88931f6cbdb4513d307b4a13fcfff08a8041 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 26 May 2020 20:53:21 -0400 Subject: drm/amdgpu: Improve the MTYPE comments Use words insteads of acronyms for better understanding. Signed-off-by: Yong Zhao Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 4e873dcbe68f..3218576e109d 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -502,15 +502,15 @@ struct drm_amdgpu_gem_op { #define AMDGPU_VM_MTYPE_MASK (0xf << 5) /* Default MTYPE. Pre-AI must use this. Recommended for newer ASICs. */ #define AMDGPU_VM_MTYPE_DEFAULT (0 << 5) -/* Use NC MTYPE instead of default MTYPE */ +/* Use Non Coherent MTYPE instead of default MTYPE */ #define AMDGPU_VM_MTYPE_NC (1 << 5) -/* Use WC MTYPE instead of default MTYPE */ +/* Use Write Combine MTYPE instead of default MTYPE */ #define AMDGPU_VM_MTYPE_WC (2 << 5) -/* Use CC MTYPE instead of default MTYPE */ +/* Use Cache Coherent MTYPE instead of default MTYPE */ #define AMDGPU_VM_MTYPE_CC (3 << 5) -/* Use UC MTYPE instead of default MTYPE */ +/* Use UnCached MTYPE instead of default MTYPE */ #define AMDGPU_VM_MTYPE_UC (4 << 5) -/* Use RW MTYPE instead of default MTYPE */ +/* Use Read Write MTYPE instead of default MTYPE */ #define AMDGPU_VM_MTYPE_RW (5 << 5) struct drm_amdgpu_gem_va { -- cgit v1.2.3 From a8a24f3f6e38103b77cf399c38eb54e1219d00d6 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:47 +0530 Subject: vfio: UAPI for migration interface for device state - Defined MIGRATION region type and sub-type. - Defined vfio_device_migration_info structure which will be placed at the 0th offset of migration region to get/set VFIO device related information. Defined members of structure and usage on read/write access. - Defined device states and state transition details. - Defined sequence to be followed while saving and resuming VFIO device. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 228 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 015516bcfaa3..ad9bb5af3463 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -305,6 +305,7 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) #define VFIO_REGION_TYPE_GFX (1) #define VFIO_REGION_TYPE_CCW (2) +#define VFIO_REGION_TYPE_MIGRATION (3) /* sub-types for VFIO_REGION_TYPE_PCI_* */ @@ -379,6 +380,233 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) +/* sub-types for VFIO_REGION_TYPE_MIGRATION */ +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +/* + * The structure vfio_device_migration_info is placed at the 0th offset of + * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related + * migration information. Field accesses from this structure are only supported + * at their native width and alignment. Otherwise, the result is undefined and + * vendor drivers should return an error. + * + * device_state: (read/write) + * - The user application writes to this field to inform the vendor driver + * about the device state to be transitioned to. + * - The vendor driver should take the necessary actions to change the + * device state. After successful transition to a given state, the + * vendor driver should return success on write(device_state, state) + * system call. If the device state transition fails, the vendor driver + * should return an appropriate -errno for the fault condition. + * - On the user application side, if the device state transition fails, + * that is, if write(device_state, state) returns an error, read + * device_state again to determine the current state of the device from + * the vendor driver. + * - The vendor driver should return previous state of the device unless + * the vendor driver has encountered an internal error, in which case + * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR. + * - The user application must use the device reset ioctl to recover the + * device from VFIO_DEVICE_STATE_ERROR state. If the device is + * indicated to be in a valid device state by reading device_state, the + * user application may attempt to transition the device to any valid + * state reachable from the current state or terminate itself. + * + * device_state consists of 3 bits: + * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear, + * it indicates the _STOP state. When the device state is changed to + * _STOP, driver should stop the device before write() returns. + * - If bit 1 is set, it indicates the _SAVING state, which means that the + * driver should start gathering device state information that will be + * provided to the VFIO user application to save the device's state. + * - If bit 2 is set, it indicates the _RESUMING state, which means that + * the driver should prepare to resume the device. Data provided through + * the migration region should be used to resume the device. + * Bits 3 - 31 are reserved for future use. To preserve them, the user + * application should perform a read-modify-write operation on this + * field when modifying the specified bits. + * + * +------- _RESUMING + * |+------ _SAVING + * ||+----- _RUNNING + * ||| + * 000b => Device Stopped, not saving or resuming + * 001b => Device running, which is the default state + * 010b => Stop the device & save the device state, stop-and-copy state + * 011b => Device running and save the device state, pre-copy state + * 100b => Device stopped and the device state is resuming + * 101b => Invalid state + * 110b => Error state + * 111b => Invalid state + * + * State transitions: + * + * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP + * (100b) (001b) (011b) (010b) (000b) + * 0. Running or default state + * | + * + * 1. Normal Shutdown (optional) + * |------------------------------------->| + * + * 2. Save the state or suspend + * |------------------------->|---------->| + * + * 3. Save the state during live migration + * |----------->|------------>|---------->| + * + * 4. Resuming + * |<---------| + * + * 5. Resumed + * |--------->| + * + * 0. Default state of VFIO device is _RUNNNG when the user application starts. + * 1. During normal shutdown of the user application, the user application may + * optionally change the VFIO device state from _RUNNING to _STOP. This + * transition is optional. The vendor driver must support this transition but + * must not require it. + * 2. When the user application saves state or suspends the application, the + * device state transitions from _RUNNING to stop-and-copy and then to _STOP. + * On state transition from _RUNNING to stop-and-copy, driver must stop the + * device, save the device state and send it to the application through the + * migration region. The sequence to be followed for such transition is given + * below. + * 3. In live migration of user application, the state transitions from _RUNNING + * to pre-copy, to stop-and-copy, and to _STOP. + * On state transition from _RUNNING to pre-copy, the driver should start + * gathering the device state while the application is still running and send + * the device state data to application through the migration region. + * On state transition from pre-copy to stop-and-copy, the driver must stop + * the device, save the device state and send it to the user application + * through the migration region. + * Vendor drivers must support the pre-copy state even for implementations + * where no data is provided to the user before the stop-and-copy state. The + * user must not be required to consume all migration data before the device + * transitions to a new state, including the stop-and-copy state. + * The sequence to be followed for above two transitions is given below. + * 4. To start the resuming phase, the device state should be transitioned from + * the _RUNNING to the _RESUMING state. + * In the _RESUMING state, the driver should use the device state data + * received through the migration region to resume the device. + * 5. After providing saved device data to the driver, the application should + * change the state from _RESUMING to _RUNNING. + * + * reserved: + * Reads on this field return zero and writes are ignored. + * + * pending_bytes: (read only) + * The number of pending bytes still to be migrated from the vendor driver. + * + * data_offset: (read only) + * The user application should read data_offset field from the migration + * region. The user application should read the device data from this + * offset within the migration region during the _SAVING state or write + * the device data during the _RESUMING state. See below for details of + * sequence to be followed. + * + * data_size: (read/write) + * The user application should read data_size to get the size in bytes of + * the data copied in the migration region during the _SAVING state and + * write the size in bytes of the data copied in the migration region + * during the _RESUMING state. + * + * The format of the migration region is as follows: + * ------------------------------------------------------------------ + * |vfio_device_migration_info| data section | + * | | /////////////////////////////// | + * ------------------------------------------------------------------ + * ^ ^ + * offset 0-trapped part data_offset + * + * The structure vfio_device_migration_info is always followed by the data + * section in the region, so data_offset will always be nonzero. The offset + * from where the data is copied is decided by the kernel driver. The data + * section can be trapped, mmapped, or partitioned, depending on how the kernel + * driver defines the data section. The data section partition can be defined + * as mapped by the sparse mmap capability. If mmapped, data_offset must be + * page aligned, whereas initial section which contains the + * vfio_device_migration_info structure, might not end at the offset, which is + * page aligned. The user is not required to access through mmap regardless + * of the capabilities of the region mmap. + * The vendor driver should determine whether and how to partition the data + * section. The vendor driver should return data_offset accordingly. + * + * The sequence to be followed while in pre-copy state and stop-and-copy state + * is as follows: + * a. Read pending_bytes, indicating the start of a new iteration to get device + * data. Repeated read on pending_bytes at this stage should have no side + * effects. + * If pending_bytes == 0, the user application should not iterate to get data + * for that device. + * If pending_bytes > 0, perform the following steps. + * b. Read data_offset, indicating that the vendor driver should make data + * available through the data section. The vendor driver should return this + * read operation only after data is available from (region + data_offset) + * to (region + data_offset + data_size). + * c. Read data_size, which is the amount of data in bytes available through + * the migration region. + * Read on data_offset and data_size should return the offset and size of + * the current buffer if the user application reads data_offset and + * data_size more than once here. + * d. Read data_size bytes of data from (region + data_offset) from the + * migration region. + * e. Process the data. + * f. Read pending_bytes, which indicates that the data from the previous + * iteration has been read. If pending_bytes > 0, go to step b. + * + * The user application can transition from the _SAVING|_RUNNING + * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the + * number of pending bytes. The user application should iterate in _SAVING + * (stop-and-copy) until pending_bytes is 0. + * + * The sequence to be followed while _RESUMING device state is as follows: + * While data for this device is available, repeat the following steps: + * a. Read data_offset from where the user application should write data. + * b. Write migration data starting at the migration region + data_offset for + * the length determined by data_size from the migration source. + * c. Write data_size, which indicates to the vendor driver that data is + * written in the migration region. Vendor driver must return this write + * operations on consuming data. Vendor driver should apply the + * user-provided migration region data to the device resume state. + * + * If an error occurs during the above sequences, the vendor driver can return + * an error code for next read() or write() operation, which will terminate the + * loop. The user application should then take the next necessary action, for + * example, failing migration or terminating the user application. + * + * For the user application, data is opaque. The user application should write + * data in the same order as the data is received and the data should be of + * same transaction size at the source. + */ + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? \ + (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) + +#define VFIO_DEVICE_STATE_IS_ERROR(state) \ + ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING)) + +#define VFIO_DEVICE_STATE_SET_ERROR(state) \ + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + + __u32 reserved; + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped * which allows direct access to non-MSIX registers which happened to be within -- cgit v1.2.3 From b704fd14a06f678f080e44db28061f1bcb1f595c Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:50 +0530 Subject: vfio iommu: Add ioctl definition for dirty pages tracking IOMMU container maintains a list of all pages pinned by vfio_pin_pages API. All pages pinned by vendor driver through this API should be considered as dirty during migration. When container consists of IOMMU capable device and all pages are pinned and mapped, then all pages are marked dirty. Added support to start/stop dirtied pages tracking and to get bitmap of all dirtied pages for requested IO virtual address range. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ad9bb5af3463..009a8c80079d 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1033,6 +1033,12 @@ struct vfio_iommu_type1_dma_map { #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) +struct vfio_bitmap { + __u64 pgsize; /* page size for bitmap in bytes */ + __u64 size; /* in bytes */ + __u64 __user *data; /* one bit per page */ +}; + /** * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, * struct vfio_dma_unmap) @@ -1059,6 +1065,57 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_type1_dirty_bitmap) + * IOCTL is used for dirty pages logging. + * Caller should set flag depending on which operation to perform, details as + * below: + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs + * the IOMMU driver to log pages that are dirtied or potentially dirtied by + * the device; designed to be used when a migration is in progress. Dirty pages + * are logged until logging is disabled by user application by calling the IOCTL + * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs + * the IOMMU driver to stop logging dirtied pages. + * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set + * returns the dirty pages bitmap for IOMMU container for a given IOVA range. + * The user must specify the IOVA range and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports getting a bitmap of the smallest supported pgsize only and can be + * modified in future to get a bitmap of any specified supported pgsize. The + * user must provide a zeroed memory area for the bitmap memory and specify its + * size in bitmap.size. One bit is used to represent one page consecutively + * starting from iova offset. The user should provide page size in bitmap.pgsize + * field. A bit set in the bitmap indicates that the page at that offset from + * iova is dirty. The caller must set argsz to a value including the size of + * structure vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the + * actual bitmap. If dirty pages logging is not enabled, an error will be + * returned. + * + * Only one of the flags _START, _STOP and _GET may be specified at a time. + * + */ +struct vfio_iommu_type1_dirty_bitmap { + __u32 argsz; + __u32 flags; +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) + __u8 data[]; +}; + +struct vfio_iommu_type1_dirty_bitmap_get { + __u64 iova; /* IO virtual address */ + __u64 size; /* Size of iova range */ + struct vfio_bitmap bitmap; +}; + +#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /* -- cgit v1.2.3 From 331e33d2960c8292bad8b02578fcfac18f721517 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:52 +0530 Subject: vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap DMA mapped pages, including those pinned by mdev vendor drivers, might get unpinned and unmapped while migration is active and device is still running. For example, in pre-copy phase while guest driver could access those pages, host device or vendor driver can dirty these mapped pages. Such pages should be marked dirty so as to maintain memory consistency for a user making use of dirty page tracking. To get bitmap during unmap, user should allocate memory for bitmap, set it all zeros, set size of allocated memory, set page size to be considered for bitmap and set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 61 +++++++++++++++++++++++++++++++++-------- include/uapi/linux/vfio.h | 11 ++++++++ 2 files changed, 61 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index be6fc0d88633..e31fcc23e81a 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1049,23 +1049,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size) } static int vfio_dma_do_unmap(struct vfio_iommu *iommu, - struct vfio_iommu_type1_dma_unmap *unmap) + struct vfio_iommu_type1_dma_unmap *unmap, + struct vfio_bitmap *bitmap) { - uint64_t mask; struct vfio_dma *dma, *dma_last = NULL; - size_t unmapped = 0; + size_t unmapped = 0, pgsize; int ret = 0, retries = 0; + unsigned long pgshift; mutex_lock(&iommu->lock); - mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1; + pgshift = __ffs(iommu->pgsize_bitmap); + pgsize = (size_t)1 << pgshift; - if (unmap->iova & mask) { + if (unmap->iova & (pgsize - 1)) { ret = -EINVAL; goto unlock; } - if (!unmap->size || unmap->size & mask) { + if (!unmap->size || unmap->size & (pgsize - 1)) { ret = -EINVAL; goto unlock; } @@ -1076,9 +1078,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, goto unlock; } - WARN_ON(mask & PAGE_MASK); -again: + /* When dirty tracking is enabled, allow only min supported pgsize */ + if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && + (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) { + ret = -EINVAL; + goto unlock; + } + WARN_ON((pgsize - 1) & PAGE_MASK); +again: /* * vfio-iommu-type1 (v1) - User mappings were coalesced together to * avoid tracking individual mappings. This means that the granularity @@ -1159,6 +1167,14 @@ again: mutex_lock(&iommu->lock); goto again; } + + if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + ret = update_user_bitmap(bitmap->data, dma, + unmap->iova, pgsize); + if (ret) + break; + } + unmapped += dma->size; vfio_remove_dma(iommu, dma); } @@ -2497,17 +2513,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { struct vfio_iommu_type1_dma_unmap unmap; - long ret; + struct vfio_bitmap bitmap = { 0 }; + int ret; minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); if (copy_from_user(&unmap, (void __user *)arg, minsz)) return -EFAULT; - if (unmap.argsz < minsz || unmap.flags) + if (unmap.argsz < minsz || + unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) return -EINVAL; - ret = vfio_dma_do_unmap(iommu, &unmap); + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { + unsigned long pgshift; + + if (unmap.argsz < (minsz + sizeof(bitmap))) + return -EINVAL; + + if (copy_from_user(&bitmap, + (void __user *)(arg + minsz), + sizeof(bitmap))) + return -EFAULT; + + if (!access_ok((void __user *)bitmap.data, bitmap.size)) + return -EINVAL; + + pgshift = __ffs(bitmap.pgsize); + ret = verify_bitmap_size(unmap.size >> pgshift, + bitmap.size); + if (ret) + return ret; + } + + ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); if (ret) return ret; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 009a8c80079d..ff4b6706f7df 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1048,12 +1048,23 @@ struct vfio_bitmap { * field. No guarantee is made to the user that arbitrary unmaps of iova * or size different from those used in the original mapping call will * succeed. + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap + * before unmapping IO virtual addresses. When this flag is set, the user must + * provide a struct vfio_bitmap in data[]. User must provide zero-allocated + * memory via vfio_bitmap.data and its size in the vfio_bitmap.size field. + * A bit in the bitmap represents one page, of user provided page size in + * vfio_bitmap.pgsize field, consecutively starting from iova offset. Bit set + * indicates that the page at that offset from iova is dirty. A Bitmap of the + * pages in the range of unmapped size is returned in the user-provided + * vfio_bitmap.data. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; __u32 flags; +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) __u64 iova; /* IO virtual address */ __u64 size; /* Size of mapping (bytes) */ + __u8 data[]; }; #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) -- cgit v1.2.3 From ad721705d09c62f0d108a6b4f59867ebfd592c90 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Fri, 29 May 2020 02:00:53 +0530 Subject: vfio iommu: Add migration capability to report supported features Added migration capability in IOMMU info chain. User application should check IOMMU info chain for migration capability to use dirty page tracking feature provided by kernel module. User application must check page sizes supported and maximum dirty bitmap size returned by this capability structure for ioctls used to get dirty bitmap. Signed-off-by: Kirti Wankhede Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 23 ++++++++++++++++++++++- include/uapi/linux/vfio.h | 23 +++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index e31fcc23e81a..f5c6ca46f165 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2423,6 +2423,22 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, return ret; } +static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap *caps) +{ + struct vfio_iommu_type1_info_cap_migration cap_mig; + + cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; + cap_mig.header.version = 1; + + cap_mig.flags = 0; + /* support minimum pgsize */ + cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap); + cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX; + + return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); +} + static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -2469,8 +2485,13 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, info.iova_pgsizes = iommu->pgsize_bitmap; - ret = vfio_iommu_iova_build_caps(iommu, &caps); + ret = vfio_iommu_migration_build_caps(iommu, &caps); + + if (!ret) + ret = vfio_iommu_iova_build_caps(iommu, &caps); + mutex_unlock(&iommu->lock); + if (ret) return ret; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ff4b6706f7df..fde4692a6989 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1013,6 +1013,29 @@ struct vfio_iommu_type1_info_cap_iova_range { struct vfio_iova_range iova_ranges[]; }; +/* + * The migration capability allows to report supported features for migration. + * + * The structures below define version 1 of this capability. + * + * The existence of this capability indicates that IOMMU kernel driver supports + * dirty page logging. + * + * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty + * page logging. + * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap + * size in bytes that can be used by user applications when getting the dirty + * bitmap. + */ +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 1 + +struct vfio_iommu_type1_info_cap_migration { + struct vfio_info_cap_header header; + __u32 flags; + __u64 pgsize_bitmap; + __u64 max_dirty_bitmap_size; /* in bytes */ +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** -- cgit v1.2.3 From b383a73f2b832491a2f9e6e8ada26aad53b5763d Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 May 2020 08:00:02 -0700 Subject: fs/ext4: Introduce DAX inode flag Add a flag ([EXT4|FS]_DAX_FL) to preserve FS_XFLAG_DAX in the ext4 inode. Set the flag to be user visible and changeable. Set the flag to be inherited. Allow applications to change the flag at any time except if it conflicts with the set of mutually exclusive flags (Currently VERITY, ENCRYPT, JOURNAL_DATA). Furthermore, restrict setting any of the exclusive flags if DAX is set. While conceptually possible, we do not allow setting EXT4_DAX_FL while at the same time clearing exclusion flags (or vice versa) for 2 reasons: 1) The DAX flag does not take effect immediately which introduces quite a bit of complexity 2) There is no clear use case for being this flexible Finally, on regular files, flag the inode to not be cached to facilitate changing S_DAX on the next creation of the inode. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20200528150003.828793-9-ira.weiny@intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 18 ++++++++++++++---- fs/ext4/inode.c | 2 +- fs/ext4/ioctl.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/super.c | 3 +++ fs/ext4/verity.c | 2 +- include/uapi/linux/fs.h | 1 + 6 files changed, 66 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 65ffb831b2b9..598e00a9453f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -415,13 +415,16 @@ struct flex_groups { #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ + #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ @@ -429,14 +432,16 @@ struct flex_groups { EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ - EXT4_PROJINHERIT_FL) + EXT4_PROJINHERIT_FL | \ + EXT4_DAX_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ - EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ @@ -448,6 +453,10 @@ struct flex_groups { /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) +/* Flags which are mutually exclusive to DAX */ +#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ + EXT4_JOURNAL_DATA_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -488,6 +497,7 @@ enum { EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 68fac9289109..778b0dbe3da6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4419,7 +4419,7 @@ static bool ext4_should_enable_dax(struct inode *inode) if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; - return false; + return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 779631e8e849..1b520d07d371 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -292,6 +292,38 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, return 0; } +static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (S_ISDIR(inode->i_mode)) + return; + + if (test_opt2(inode->i_sb, DAX_NEVER) || + test_opt(inode->i_sb, DAX_ALWAYS)) + return; + + if ((ei->i_flags ^ flags) & EXT4_DAX_FL) + d_mark_dontcache(inode); +} + +static bool dax_compatible(struct inode *inode, unsigned int oldflags, + unsigned int flags) +{ + if (flags & EXT4_DAX_FL) { + if ((oldflags & EXT4_DAX_MUT_EXCL) || + ext4_test_inode_state(inode, + EXT4_STATE_VERITY_IN_PROGRESS)) { + return false; + } + } + + if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) + return false; + + return true; +} + static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { @@ -320,6 +352,12 @@ static int ext4_ioctl_setflags(struct inode *inode, if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } + + if (!dax_compatible(inode, oldflags, flags)) { + err = -EOPNOTSUPP; + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; @@ -365,6 +403,8 @@ static int ext4_ioctl_setflags(struct inode *inode, if (err) goto flags_err; + ext4_dax_dontcache(inode, flags); + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; @@ -525,12 +565,15 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) xflags |= FS_XFLAG_NOATIME; if (iflags & EXT4_PROJINHERIT_FL) xflags |= FS_XFLAG_PROJINHERIT; + if (iflags & EXT4_DAX_FL) + xflags |= FS_XFLAG_DAX; return xflags; } #define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ - FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \ + FS_XFLAG_DAX) /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) @@ -549,6 +592,8 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) iflags |= EXT4_NOATIME_FL; if (xflags & FS_XFLAG_PROJINHERIT) iflags |= EXT4_PROJINHERIT_FL; + if (xflags & FS_XFLAG_DAX) + iflags |= EXT4_DAX_FL; return iflags; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e056aa20ce9..3658e3016999 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1323,6 +1323,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + res = ext4_convert_inline_data(inode); if (res) return res; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 89a155ece323..4fecb3e4e338 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -113,7 +113,7 @@ static int ext4_begin_enable_verity(struct file *filp) handle_t *handle; int err; - if (IS_DAX(inode)) + if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX)) return -EINVAL; if (ext4_verity_in_progress(inode)) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 379a612f8f1d..f44eb0a04afd 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -262,6 +262,7 @@ struct fsxattr { #define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#define FS_DAX_FL 0x02000000 /* Inode is DAX */ #define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ #define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define FS_CASEFOLD_FL 0x40000000 /* Folder is case insensitive */ -- cgit v1.2.3 From 43e64bf301fd8c54f0082d91c6ffd4de861baf96 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 28 May 2020 21:34:29 +0200 Subject: cfg80211: handle 6 GHz capability of new station Handle 6 GHz HE capability while adding new station. It will be used later in mac80211 station processing. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1589399105-25472-2-git-send-email-rmanohar@codeaurora.org [handle nl80211_set_station, require WME, remove NL80211_HE_6GHZ_CAPABILITY_LEN] Link: https://lore.kernel.org/r/20200528213443.b6b711fd4312.Ic9b97d57b6c4f2b28d4b2d23d2849d8bc20bd8cc@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 5 +++++ net/wireless/nl80211.c | 18 +++++++++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a38653358885..da734ea71b5a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1238,6 +1238,7 @@ struct sta_txpwr { * @he_capa_len: the length of the HE capabilities * @airtime_weight: airtime scheduler weight for this station * @txpwr: transmit power for an associated station + * @he_6ghz_capa: HE 6 GHz Band capabilities of station */ struct station_parameters { const u8 *supported_rates; @@ -1270,6 +1271,7 @@ struct station_parameters { u8 he_capa_len; u16 airtime_weight; struct sta_txpwr txpwr; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c14666b75e57..e42ae429383e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2502,6 +2502,9 @@ enum nl80211_commands { * first channel segment specified in %NL80211_ATTR_CENTER_FREQ1. * @NL80211_ATTR_SCAN_FREQ_KHZ: nested attribute with KHz frequencies * + * @NL80211_ATTR_HE_6GHZ_CAPABILITY: HE 6 GHz Band Capability element (from + * association request when used with NL80211_CMD_NEW_STATION). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2982,6 +2985,8 @@ enum nl80211_attrs { NL80211_ATTR_CENTER_FREQ1_OFFSET, NL80211_ATTR_SCAN_FREQ_KHZ, + NL80211_ATTR_HE_6GHZ_CAPABILITY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 22c4d13e28cb..bf8bd8268cb7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -654,6 +654,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, + [NL80211_ATTR_HE_6GHZ_CAPABILITY] = { + .type = NLA_EXACT_LEN, + .len = sizeof(struct ieee80211_he_6ghz_capa), + }, }; /* policy for the key attributes */ @@ -5989,6 +5993,10 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) + params.he_6ghz_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) params.airtime_weight = nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]); @@ -6123,6 +6131,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) + params.he_6ghz_capa = + nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.opmode_notif_used = true; params.opmode_notif = @@ -6167,10 +6179,14 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = NULL; /* HE requires WME */ - if (params.he_capa_len) + if (params.he_capa_len || params.he_6ghz_capa) return -EINVAL; } + /* Ensure that HT/VHT capabilities are not set for 6 GHz HE STA */ + if (params.he_6ghz_capa && (params.ht_capa || params.vht_capa)) + return -EINVAL; + /* When you run into this, adjust the code below for the new flag */ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); -- cgit v1.2.3 From 223952177296c34d9c8de9cde33204caffe55725 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 May 2020 21:34:31 +0200 Subject: cfg80211: add and expose HE 6 GHz band capabilities These capabilities cover what would otherwise be transported in HT/VHT capabilities, but only a subset thereof that is actually needed on 6 GHz with HE already present. Expose the capabilities to userspace, drivers are expected to set them as using the 6 GHz band (currently) requires HE capability. Link: https://lore.kernel.org/r/20200528213443.244cd5cb9db8.Icd8c773277a88c837e7e3af1d4d1013cc3b66543@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 9 ++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index da734ea71b5a..9b76be3d561a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -354,10 +354,13 @@ struct ieee80211_sta_he_cap { * * @types_mask: interface types mask * @he_cap: holds the HE capabilities + * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a + * 6 GHz band channel (and 0 may be valid value). */ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e42ae429383e..5b350d032fa3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3565,6 +3565,8 @@ enum nl80211_mpath_info { * defined in HE capabilities IE * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently * defined + * @NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA: HE 6GHz band capabilities (__le16), + * given for all 6 GHz band channels * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use */ enum nl80211_band_iftype_attr { @@ -3575,6 +3577,7 @@ enum nl80211_band_iftype_attr { NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, /* keep last */ __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bf8bd8268cb7..3a24e6add13e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1562,6 +1562,7 @@ static int nl80211_send_coalesce(struct sk_buff *msg, static int nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_supported_band *sband, const struct ieee80211_sband_iftype_data *iftdata) { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; @@ -1585,6 +1586,12 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (sband->band == NL80211_BAND_6GHZ && + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, + sizeof(iftdata->he_6ghz_capa), + &iftdata->he_6ghz_capa)) + return -ENOBUFS; + return 0; } @@ -1633,7 +1640,7 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, if (!iftdata) return -ENOBUFS; - err = nl80211_send_iftype_data(msg, + err = nl80211_send_iftype_data(msg, sband, &sband->iftype_data[i]); if (err) return err; -- cgit v1.2.3 From 093a48d2aa4b74db3134b61d7b7a061dbe79177b Mon Sep 17 00:00:00 2001 From: Nathan Errera Date: Thu, 28 May 2020 21:22:38 +0200 Subject: cfg80211: support bigger kek/kck key length With some newer AKMs, the KCK and KEK are bigger, so allow that if the driver advertises support for it. In addition, add a new attribute for the AKM so we can use it for offloaded rekeying. Signed-off-by: Nathan Errera [reword commit message] Link: https://lore.kernel.org/r/20200528212237.5eb58b00a5d1.I61b09d77c4f382e8d58a05dcca78096e99a6bc15@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 +++++++++--- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 23 +++++++++++++++++++---- 3 files changed, 32 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 95b55eea2afb..b58ad1a3f695 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2936,12 +2936,17 @@ struct cfg80211_wowlan_wakeup { /** * struct cfg80211_gtk_rekey_data - rekey data - * @kek: key encryption key (NL80211_KEK_LEN bytes) - * @kck: key confirmation key (NL80211_KCK_LEN bytes) + * @kek: key encryption key (@kek_len bytes) + * @kck: key confirmation key (@kck_len bytes) * @replay_ctr: replay counter (NL80211_REPLAY_CTR_LEN bytes) + * @kek_len: length of kek + * @kck_len length of kck + * @akm: akm (oui, id) */ struct cfg80211_gtk_rekey_data { const u8 *kek, *kck, *replay_ctr; + u32 akm; + u8 kek_len, kck_len; }; /** @@ -4166,9 +4171,10 @@ struct cfg80211_ops { * beaconing mode (AP, IBSS, Mesh, ...). * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation * before connection. + * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys */ enum wiphy_flags { - /* use hole at 0 */ + WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5b350d032fa3..dad8c8f8581f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5396,6 +5396,8 @@ enum plink_actions { #define NL80211_KCK_LEN 16 #define NL80211_KEK_LEN 16 +#define NL80211_KCK_EXT_LEN 24 +#define NL80211_KEK_EXT_LEN 32 #define NL80211_REPLAY_CTR_LEN 8 /** @@ -5404,6 +5406,7 @@ enum plink_actions { * @NL80211_REKEY_DATA_KEK: key encryption key (binary) * @NL80211_REKEY_DATA_KCK: key confirmation key (binary) * @NL80211_REKEY_DATA_REPLAY_CTR: replay counter (binary) + * @NL80211_REKEY_DATA_AKM: AKM data (OUI, suite type) * @NUM_NL80211_REKEY_DATA: number of rekey attributes (internal) * @MAX_NL80211_REKEY_DATA: highest rekey attribute (internal) */ @@ -5412,6 +5415,7 @@ enum nl80211_rekey_data { NL80211_REKEY_DATA_KEK, NL80211_REKEY_DATA_KCK, NL80211_REKEY_DATA_REPLAY_CTR, + NL80211_REKEY_DATA_AKM, /* keep last */ NUM_NL80211_REKEY_DATA, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3a24e6add13e..263ae395ad44 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -730,9 +730,16 @@ nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { - [NL80211_REKEY_DATA_KEK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KEK_LEN), - [NL80211_REKEY_DATA_KCK] = NLA_POLICY_EXACT_LEN_WARN(NL80211_KCK_LEN), + [NL80211_REKEY_DATA_KEK] = { + .type = NLA_BINARY, + .len = NL80211_KEK_EXT_LEN + }, + [NL80211_REKEY_DATA_KCK] = { + .type = NLA_BINARY, + .len = NL80211_KCK_EXT_LEN + }, [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), + [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 }, }; static const struct nla_policy @@ -12347,14 +12354,22 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN)) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KCK_EXT_LEN)) return -ERANGE; rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]); rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]); + rekey_data.kek_len = nla_len(tb[NL80211_REKEY_DATA_KEK]); + rekey_data.kck_len = nla_len(tb[NL80211_REKEY_DATA_KCK]); + if (tb[NL80211_REKEY_DATA_AKM]) + rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]); wdev_lock(wdev); if (!wdev->current_bss) { -- cgit v1.2.3 From 72de5fa4c16195827252b961ba44028a39dfeaff Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:22 +0200 Subject: KVM: x86: announce KVM_FEATURE_ASYNC_PF_INT Introduce new capability to indicate that KVM supports interrupt based delivery of 'page ready' APF events. This includes support for both MSR_KVM_ASYNC_PF_INT and MSR_KVM_ASYNC_PF_ACK. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-8-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/cpuid.rst | 6 ++++++ Documentation/virt/kvm/msr.rst | 12 ++++++++---- arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 1 + include/uapi/linux/kvm.h | 1 + 6 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst index f721c89327ec..a7dff9186bed 100644 --- a/Documentation/virt/kvm/cpuid.rst +++ b/Documentation/virt/kvm/cpuid.rst @@ -86,6 +86,12 @@ KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit before using paravirtualized sched yield. +KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit + before using the second async + pf control msr 0x4b564d06 and + async pf acknowledgment msr + 0x4b564d07. + KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side per-cpu warps are expeced in kvmclock diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst index 9b107889b033..e37a14c323d2 100644 --- a/Documentation/virt/kvm/msr.rst +++ b/Documentation/virt/kvm/msr.rst @@ -213,7 +213,8 @@ data: cpl == 0. Bit 2 is 1 if asynchronous page faults are delivered to L1 as #PF vmexits. Bit 2 can be set only if KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. Bit 3 enables interrupt based delivery of 'page ready' - events. + events. Bit 3 can only be set if KVM_FEATURE_ASYNC_PF_INT is present in + CPUID. 'Page not present' events are currently always delivered as synthetic #PF exception. During delivery of these events APF CR2 register contains @@ -242,7 +243,8 @@ data: Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page ready' APF delivery needs to be written to before enabling APF mechanism - in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected. + in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected. The MSR is + available if KVM_FEATURE_ASYNC_PF_INT is present in CPUID. Note, previously, 'page ready' events were delivered via the same #PF exception as 'page not present' events but this is now deprecated. If @@ -360,7 +362,8 @@ data: Interrupt vector for asynchnonous 'page ready' notifications delivery. The vector has to be set up before asynchronous page fault mechanism - is enabled in MSR_KVM_ASYNC_PF_EN. + is enabled in MSR_KVM_ASYNC_PF_EN. The MSR is only available if + KVM_FEATURE_ASYNC_PF_INT is present in CPUID. MSR_KVM_ASYNC_PF_ACK: 0x4b564d07 @@ -371,4 +374,5 @@ data: When the guest is done processing 'page ready' APF event and 'token' field in 'struct kvm_vcpu_pv_apf_data' is cleared it is supposed to write '1' to bit 0 of the MSR, this causes the host to re-scan its queue - and check if there are more notifications pending. + and check if there are more notifications pending. The MSR is available + if KVM_FEATURE_ASYNC_PF_INT is present in CPUID. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 7ac20df80ba8..812e9b4c1114 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -31,6 +31,7 @@ #define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_FEATURE_POLL_CONTROL 12 #define KVM_FEATURE_PV_SCHED_YIELD 13 +#define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cd708b0b460a..a9f1905896fb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -711,7 +711,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | (1 << KVM_FEATURE_PV_SEND_IPI) | (1 << KVM_FEATURE_POLL_CONTROL) | - (1 << KVM_FEATURE_PV_SCHED_YIELD); + (1 << KVM_FEATURE_PV_SCHED_YIELD) | + (1 << KVM_FEATURE_ASYNC_PF_INT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9d709a672f3..ae3a7f2fbda2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3446,6 +3446,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ac9eba0289d1..fc0075988b80 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1018,6 +1018,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED 180 #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 +#define KVM_CAP_ASYNC_PF_INT 183 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From f7d31e65368aeef973fab788aa22c4f1d5a6af66 Mon Sep 17 00:00:00 2001 From: Jon Doron Date: Fri, 24 Apr 2020 14:37:40 +0300 Subject: x86/kvm/hyper-v: Explicitly align hcall param for kvm_hyperv_exit The problem the patch is trying to address is the fact that 'struct kvm_hyperv_exit' has different layout on when compiling in 32 and 64 bit modes. In 64-bit mode the default alignment boundary is 64 bits thus forcing extra gaps after 'type' and 'msr' but in 32-bit mode the boundary is at 32 bits thus no extra gaps. This is an issue as even when the kernel is 64 bit, the userspace using the interface can be both 32 and 64 bit but the same 32 bit userspace has to work with 32 bit kernel. The issue is fixed by forcing the 64 bit layout, this leads to ABI change for 32 bit builds and while we are obviously breaking '32 bit userspace with 32 bit kernel' case, we're fixing the '32 bit userspace with 64 bit kernel' one. As the interface has no (known) users and 32 bit KVM is rather baroque nowadays, this seems like a reasonable decision. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Jon Doron Message-Id: <20200424113746.3473563-2-arilou@gmail.com> Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 ++ include/uapi/linux/kvm.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d871dacb984e..d0569db2b1e2 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5067,9 +5067,11 @@ EOI was received. #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 __u32 type; + __u32 pad1; union { struct { __u32 msr; + __u32 pad2; __u64 control; __u64 evt_page; __u64 msg_page; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index fc0075988b80..cc04aceb8793 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -189,9 +189,11 @@ struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 __u32 type; + __u32 pad1; union { struct { __u32 msr; + __u32 pad2; __u64 control; __u64 evt_page; __u64 msg_page; -- cgit v1.2.3 From f97f5a56f5977311f3833056a73cdbb0ee56cb1e Mon Sep 17 00:00:00 2001 From: Jon Doron Date: Fri, 29 May 2020 16:45:40 +0300 Subject: x86/kvm/hyper-v: Add support for synthetic debugger interface Add support for Hyper-V synthetic debugger (syndbg) interface. The syndbg interface is using MSRs to emulate a way to send/recv packets data. The debug transport dll (kdvm/kdnet) will identify if Hyper-V is enabled and if it supports the synthetic debugger interface it will attempt to use it, instead of trying to initialize a network adapter. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Jon Doron Message-Id: <20200529134543.1127440-4-arilou@gmail.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 16 ++++ arch/x86/include/asm/kvm_host.h | 13 ++++ arch/x86/kvm/hyperv.c | 158 +++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/hyperv.h | 5 ++ arch/x86/kvm/trace.h | 51 +++++++++++++ arch/x86/kvm/x86.c | 8 ++ include/uapi/linux/kvm.h | 10 +++ 7 files changed, 258 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index db22e834ce2a..aad60be4884e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5070,6 +5070,7 @@ EOI was received. struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 + #define KVM_EXIT_HYPERV_SYNDBG 3 __u32 type; __u32 pad1; union { @@ -5085,6 +5086,15 @@ EOI was received. __u64 result; __u64 params[2]; } hcall; + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 status; + __u64 send_page; + __u64 recv_page; + __u64 pending_page; + } syndbg; } u; }; /* KVM_EXIT_HYPERV */ @@ -5101,6 +5111,12 @@ Hyper-V SynIC state change. Notification is used to remap SynIC event/message pages and to enable/disable SynIC messages/events processing in userspace. + - KVM_EXIT_HYPERV_SYNDBG -- synchronously notify user-space about + +Hyper-V Synthetic debugger state change. Notification is used to either update +the pending_page location or to send a control command (send the buffer located +in send_page or recv a buffer to recv_page). + :: /* KVM_EXIT_ARM_NISV */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b878fcd164ce..58337a25396a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -863,6 +863,18 @@ struct kvm_apic_map { struct kvm_lapic *phys_map[]; }; +/* Hyper-V synthetic debugger (SynDbg)*/ +struct kvm_hv_syndbg { + struct { + u64 control; + u64 status; + u64 send_page; + u64 recv_page; + u64 pending_page; + } control; + u64 options; +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -886,6 +898,7 @@ struct kvm_hv { atomic_t num_mismatched_vp_indexes; struct hv_partition_assist_pg *hv_pa_pg; + struct kvm_hv_syndbg hv_syndbg; }; enum kvm_irqchip_mode { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f9d3b919823c..c21f99357ad5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -21,6 +21,7 @@ #include "x86.h" #include "lapic.h" #include "ioapic.h" +#include "cpuid.h" #include "hyperv.h" #include @@ -266,6 +267,123 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } +static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, + HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, + 0); + if (!entry) + return false; + + return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; +} + +static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) + hv->hv_syndbg.control.status = + vcpu->run->hyperv.u.syndbg.status; + return 1; +} + +static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + + hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; + hv_vcpu->exit.u.syndbg.msr = msr; + hv_vcpu->exit.u.syndbg.control = syndbg->control.control; + hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; + hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; + hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; + vcpu->arch.complete_userspace_io = + kvm_hv_syndbg_complete_userspace; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); +} + +static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data); + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + syndbg->control.control = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_STATUS: + syndbg->control.status = data; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + syndbg->control.send_page = data; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + syndbg->control.recv_page = data; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + syndbg->control.pending_page = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + syndbg->options = data; + break; + default: + break; + } + + return 0; +} + +static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + *pdata = syndbg->control.control; + break; + case HV_X64_MSR_SYNDBG_STATUS: + *pdata = syndbg->control.status; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + *pdata = syndbg->control.send_page; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + *pdata = syndbg->control.recv_page; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + *pdata = syndbg->control.pending_page; + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + *pdata = syndbg->options; + break; + default: + break; + } + + trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, + *pdata); + + return 0; +} + static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { @@ -800,6 +918,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } @@ -1061,6 +1181,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_set_msr(vcpu, msr, data, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1190,7 +1313,8 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 0; } -static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; @@ -1227,6 +1351,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_get_msr(vcpu, msr, pdata, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1316,7 +1443,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) int r; mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); - r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else @@ -1795,6 +1922,9 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, + { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, + { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, + { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); @@ -1820,7 +1950,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); - ent->eax = HYPERV_CPUID_NESTED_FEATURES; + ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; @@ -1859,6 +1989,10 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + ent->ebx |= HV_X64_DEBUGGING; + ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; + ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + /* * Direct Synthetic timers only make sense with in-kernel * LAPIC @@ -1902,6 +2036,24 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; + case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: + memcpy(signature, "Linux KVM Hv", 12); + + ent->eax = 0; + ent->ebx = signature[0]; + ent->ecx = signature[1]; + ent->edx = signature[2]; + break; + + case HYPERV_CPUID_SYNDBG_INTERFACE: + memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); + ent->eax = signature[0]; + break; + + case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: + ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + break; + default: break; } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 7f50ff0bad07..e68c6c2e9649 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -73,6 +73,11 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); } +static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu) +{ + return &vcpu->kvm->arch.hyperv.hv_syndbg; +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 54a10c98d746..b66432b015d2 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1541,6 +1541,57 @@ TRACE_EVENT(kvm_nested_vmenter_failed, __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); +/* + * Tracepoint for syndbg_set_msr. + */ +TRACE_EVENT(kvm_hv_syndbg_set_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); + +/* + * Tracepoint for syndbg_get_msr. + */ +TRACE_EVENT(kvm_hv_syndbg_get_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca8a57312291..9e41b5135340 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1246,6 +1246,10 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, @@ -3011,6 +3015,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -3272,6 +3278,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cc04aceb8793..6721eb563eda 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -188,6 +188,7 @@ struct kvm_s390_cmma_log { struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 +#define KVM_EXIT_HYPERV_SYNDBG 3 __u32 type; __u32 pad1; union { @@ -203,6 +204,15 @@ struct kvm_hyperv_exit { __u64 result; __u64 params[2]; } hcall; + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 status; + __u64 send_page; + __u64 recv_page; + __u64 pending_page; + } syndbg; } u; }; -- cgit v1.2.3 From 9eefeabed6f831018c15bd7e17d34967ee34d9dd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:39 +0300 Subject: devlink: Add 'mirror' trap action The action is used by control traps such as IGMP query. The packet is flooded by the device, but also trapped to the CPU in order for the software bridge to mark the receiving port as a multicast router port. Such packets are marked with 'skb->offload_fwd_mark = 1' in order to prevent the software bridge from flooding them again. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 2 ++ include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 4ca241e70064..5b97327caefc 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -108,6 +108,8 @@ The ``devlink-trap`` mechanism supports the following packet trap actions: * ``trap``: The sole copy of the packet is sent to the CPU. * ``drop``: The packet is dropped by the underlying device and a copy is not sent to the CPU. + * ``mirror``: The packet is forwarded by the underlying device and a copy is + sent to the CPU. Generic Packet Traps ==================== diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1ae90e06c06d..16305932a950 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -233,10 +233,13 @@ enum { * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not * sent to the CPU. * @DEVLINK_TRAP_ACTION_TRAP: The sole copy of the packet is sent to the CPU. + * @DEVLINK_TRAP_ACTION_MIRROR: Packet is forwarded by the device and a copy is + * sent to the CPU. */ enum devlink_trap_action { DEVLINK_TRAP_ACTION_DROP, DEVLINK_TRAP_ACTION_TRAP, + DEVLINK_TRAP_ACTION_MIRROR, }; /** diff --git a/net/core/devlink.c b/net/core/devlink.c index d9fff7083f02..d6298917b077 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5869,7 +5869,8 @@ devlink_trap_action_get_from_info(struct genl_info *info, val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ - case DEVLINK_TRAP_ACTION_TRAP: + case DEVLINK_TRAP_ACTION_TRAP: /* fall-through */ + case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: -- cgit v1.2.3 From 30a4e9a29ab9aadfe6c5386ae4aa396b1d2556c2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:40 +0300 Subject: devlink: Add 'control' trap type This type is used for traps that trap control packets such as ARP request and IGMP query to the CPU. Do not report such packets to the kernel's drop monitor as they were not dropped by the device no encountered an exception during forwarding. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 8 +++++++- include/uapi/linux/devlink.h | 6 ++++++ net/core/devlink.c | 7 +++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 5b97327caefc..6c293cfa23ee 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -55,7 +55,7 @@ The following diagram provides a general overview of ``devlink-trap``:: | | +-------^--------+ | - | + | Non-control traps | +----+----+ | | Kernel's Rx path @@ -97,6 +97,12 @@ The ``devlink-trap`` mechanism supports the following packet trap types: processed by ``devlink`` and injected to the kernel's Rx path. Changing the action of such traps is not allowed, as it can easily break the control plane. + * ``control``: Trapped packets were trapped by the device because these are + control packets required for the correct functioning of the control plane. + For example, ARP request and IGMP query packets. Packets are injected to + the kernel's Rx path, but not reported to the kernel's drop monitor. + Changing the action of such traps is not allowed, as it can easily break + the control plane. .. _Trap-Actions: diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 16305932a950..08563e6a424d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -253,10 +253,16 @@ enum devlink_trap_action { * control plane for resolution. Trapped packets * are processed by devlink and injected to * the kernel's Rx path. + * @DEVLINK_TRAP_TYPE_CONTROL: Packet was trapped because it is required for + * the correct functioning of the control plane. + * For example, an ARP request packet. Trapped + * packets are injected to the kernel's Rx path, + * but not reported to drop monitor. */ enum devlink_trap_type { DEVLINK_TRAP_TYPE_DROP, DEVLINK_TRAP_TYPE_EXCEPTION, + DEVLINK_TRAP_TYPE_CONTROL, }; enum { diff --git a/net/core/devlink.c b/net/core/devlink.c index d6298917b077..47c28e0f848f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8847,6 +8847,13 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + /* Control packets were not dropped by the device or encountered an + * exception during forwarding and therefore should not be reported to + * the kernel's drop monitor. + */ + if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL) + return; + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, in_devlink_port, fa_cookie); net_dm_hw_report(skb, &hw_metadata); -- cgit v1.2.3 From 7e89ed8ab3f74e0746d3ea80537d7a06b0e27732 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:46 +0000 Subject: bridge: mrp: Update MRP frame type Replace u16/u32 with be16/be32 in the MRP frame types. This fixes sparse warnings like: warning: cast to restricted __be16 Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/mrp_bridge.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index 2600cdf5a284..bcad42128d62 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -55,30 +55,30 @@ struct br_mrp_end_hdr { }; struct br_mrp_common_hdr { - __u16 seq_id; + __be16 seq_id; __u8 domain[MRP_DOMAIN_UUID_LENGTH]; }; struct br_mrp_ring_test_hdr { - __u16 prio; + __be16 prio; __u8 sa[ETH_ALEN]; - __u16 port_role; - __u16 state; - __u16 transitions; - __u32 timestamp; + __be16 port_role; + __be16 state; + __be16 transitions; + __be32 timestamp; }; struct br_mrp_ring_topo_hdr { - __u16 prio; + __be16 prio; __u8 sa[ETH_ALEN]; - __u16 interval; + __be16 interval; }; struct br_mrp_ring_link_hdr { __u8 sa[ETH_ALEN]; - __u16 port_role; - __u16 interval; - __u16 blocked; + __be16 port_role; + __be16 interval; + __be16 blocked; }; #endif -- cgit v1.2.3 From 4b3a61b030d1131dcf3633a276158a3d0a435a47 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:47 +0000 Subject: bridge: mrp: Set the priority of MRP instance Each MRP instance has a priority, a lower value means a higher priority. The priority of MRP instance is stored in MRP_Test frame in this way all the MRP nodes in the ring can see other nodes priority. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + include/uapi/linux/if_bridge.h | 2 ++ net/bridge/br_mrp.c | 3 ++- net/bridge/br_mrp_netlink.c | 5 +++++ net/bridge/br_mrp_switchdev.c | 1 + net/bridge/br_private_mrp.h | 1 + 6 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index db519957e134..f82ef4c45f5e 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -116,6 +116,7 @@ struct switchdev_obj_mrp { struct net_device *p_port; struct net_device *s_port; u32 ring_id; + u16 prio; }; #define SWITCHDEV_OBJ_MRP(OBJ) \ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 5a43eb86c93b..0162c1370ecb 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -176,6 +176,7 @@ enum { IFLA_BRIDGE_MRP_INSTANCE_RING_ID, IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX, IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX, + IFLA_BRIDGE_MRP_INSTANCE_PRIO, __IFLA_BRIDGE_MRP_INSTANCE_MAX, }; @@ -230,6 +231,7 @@ struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; __u32 s_ifindex; + __u16 prio; }; struct br_mrp_ring_state { diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 8ea59504ef47..f8fd037219fe 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -147,7 +147,7 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); hdr = skb_put(skb, sizeof(*hdr)); - hdr->prio = cpu_to_be16(MRP_DEFAULT_PRIO); + hdr->prio = cpu_to_be16(mrp->prio); ether_addr_copy(hdr->sa, p->br->dev->dev_addr); hdr->port_role = cpu_to_be16(port_role); hdr->state = cpu_to_be16(mrp->ring_state); @@ -290,6 +290,7 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) return -ENOMEM; mrp->ring_id = instance->ring_id; + mrp->prio = instance->prio; p = br_mrp_get_port(br, instance->p_ifindex); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index d9de780d2ce0..8cb67d9ca44e 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -22,6 +22,7 @@ br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = { .type = NLA_U16 }, }; static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, @@ -49,6 +50,10 @@ static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + inst.prio = MRP_DEFAULT_PRIO; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]) + inst.prio = nla_get_u16(tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]); if (cmd == RTM_SETLINK) return br_mrp_add(br, &inst); diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 51cb1d5a24b4..3a776043bf80 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -12,6 +12,7 @@ int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) .p_port = rtnl_dereference(mrp->p_port)->dev, .s_port = rtnl_dereference(mrp->s_port)->dev, .ring_id = mrp->ring_id, + .prio = mrp->prio, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index a0f53cc3ab85..558941ce2366 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -14,6 +14,7 @@ struct br_mrp { struct net_bridge_port __rcu *s_port; u32 ring_id; + u16 prio; enum br_mrp_ring_role_type ring_role; u8 ring_role_offloaded; -- cgit v1.2.3 From c6676e7d62cfb5cb7c1c5320a26f3634a11afdb0 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 30 May 2020 18:09:48 +0000 Subject: bridge: mrp: Add support for role MRA A node that has the MRA role, it can behave as MRM or MRC. Initially it starts as MRM and sends MRP_Test frames on both ring ports. If it detects that there are MRP_Test send by another MRM, then it checks if these frames have a lower priority than itself. In this case it would send MRP_Nack frames to notify the other node that it needs to stop sending MRP_Test frames. If it receives a MRP_Nack frame then it stops sending MRP_Test frames and starts to behave as a MRC but it would continue to monitor the MRP_Test frames send by MRM. If at a point the MRM stops to send MRP_Test frames it would get the MRM role and start to send MRP_Test frames. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + include/uapi/linux/if_bridge.h | 2 + include/uapi/linux/mrp_bridge.h | 38 ++++++++++++ net/bridge/br_mrp.c | 125 ++++++++++++++++++++++++++++++++++------ net/bridge/br_mrp_netlink.c | 6 ++ net/bridge/br_mrp_switchdev.c | 4 +- net/bridge/br_private_mrp.h | 4 +- 7 files changed, 159 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index f82ef4c45f5e..b8c059b4e06d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -130,6 +130,7 @@ struct switchdev_obj_ring_test_mrp { u8 max_miss; u32 ring_id; u32 period; + bool monitor; }; #define SWITCHDEV_OBJ_RING_TEST_MRP(OBJ) \ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 0162c1370ecb..caa6914a3e53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -222,6 +222,7 @@ enum { IFLA_BRIDGE_MRP_START_TEST_INTERVAL, IFLA_BRIDGE_MRP_START_TEST_MAX_MISS, IFLA_BRIDGE_MRP_START_TEST_PERIOD, + IFLA_BRIDGE_MRP_START_TEST_MONITOR, __IFLA_BRIDGE_MRP_START_TEST_MAX, }; @@ -249,6 +250,7 @@ struct br_mrp_start_test { __u32 interval; __u32 max_miss; __u32 period; + __u32 monitor; }; struct bridge_stp_xstats { diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index bcad42128d62..84f15f48a7cb 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -11,11 +11,14 @@ #define MRP_DOMAIN_UUID_LENGTH 16 #define MRP_VERSION 1 #define MRP_FRAME_PRIO 7 +#define MRP_OUI_LENGTH 3 +#define MRP_MANUFACTURE_DATA_LENGTH 2 enum br_mrp_ring_role_type { BR_MRP_RING_ROLE_DISABLED, BR_MRP_RING_ROLE_MRC, BR_MRP_RING_ROLE_MRM, + BR_MRP_RING_ROLE_MRA, }; enum br_mrp_ring_state_type { @@ -43,6 +46,13 @@ enum br_mrp_tlv_header_type { BR_MRP_TLV_HEADER_RING_TOPO = 0x3, BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, + BR_MRP_TLV_HEADER_OPTION = 0x7f, +}; + +enum br_mrp_sub_tlv_header_type { + BR_MRP_SUB_TLV_HEADER_TEST_MGR_NACK = 0x1, + BR_MRP_SUB_TLV_HEADER_TEST_PROPAGATE = 0x2, + BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR = 0x3, }; struct br_mrp_tlv_hdr { @@ -50,6 +60,11 @@ struct br_mrp_tlv_hdr { __u8 length; }; +struct br_mrp_sub_tlv_hdr { + __u8 type; + __u8 length; +}; + struct br_mrp_end_hdr { struct br_mrp_tlv_hdr hdr; }; @@ -81,4 +96,27 @@ struct br_mrp_ring_link_hdr { __be16 blocked; }; +struct br_mrp_sub_opt_hdr { + __u8 type; + __u8 manufacture_data[MRP_MANUFACTURE_DATA_LENGTH]; +}; + +struct br_mrp_test_mgr_nack_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 other_prio; + __u8 other_sa[ETH_ALEN]; +}; + +struct br_mrp_test_prop_hdr { + __be16 prio; + __u8 sa[ETH_ALEN]; + __be16 other_prio; + __u8 other_sa[ETH_ALEN]; +}; + +struct br_mrp_oui_hdr { + __u8 oui[MRP_OUI_LENGTH]; +}; + #endif diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index f8fd037219fe..24986ec7d38c 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -160,6 +160,16 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, return skb; } +/* This function is continuously called in the following cases: + * - when node role is MRM, in this case test_monitor is always set to false + * because it needs to notify the userspace that the ring is open and needs to + * send MRP_Test frames + * - when node role is MRA, there are 2 subcases: + * - when MRA behaves as MRM, in this case is similar with MRM role + * - when MRA behaves as MRC, in this case test_monitor is set to true, + * because it needs to detect when it stops seeing MRP_Test frames + * from MRM node but it doesn't need to send MRP_Test frames. + */ static void br_mrp_test_work_expired(struct work_struct *work) { struct delayed_work *del_work = to_delayed_work(work); @@ -177,8 +187,14 @@ static void br_mrp_test_work_expired(struct work_struct *work) /* Notify that the ring is open only if the ring state is * closed, otherwise it would continue to notify at every * interval. + * Also notify that the ring is open when the node has the + * role MRA and behaves as MRC. The reason is that the + * userspace needs to know when the MRM stopped sending + * MRP_Test frames so that the current node to try to take + * the role of a MRM. */ - if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED) + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED || + mrp->test_monitor) notify_open = true; } @@ -186,12 +202,15 @@ static void br_mrp_test_work_expired(struct work_struct *work) p = rcu_dereference(mrp->p_port); if (p) { - skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); - if (!skb) - goto out; - - skb_reset_network_header(skb); - dev_queue_xmit(skb); + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } if (notify_open && !mrp->ring_role_offloaded) br_mrp_port_open(p->dev, true); @@ -199,12 +218,15 @@ static void br_mrp_test_work_expired(struct work_struct *work) p = rcu_dereference(mrp->s_port); if (p) { - skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); - if (!skb) - goto out; - - skb_reset_network_header(skb); - dev_queue_xmit(skb); + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } if (notify_open && !mrp->ring_role_offloaded) br_mrp_port_open(p->dev, true); @@ -227,7 +249,7 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) /* Stop sending MRP_Test frames */ cancel_delayed_work_sync(&mrp->test_work); - br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0); br_mrp_switchdev_del(br, mrp); @@ -452,8 +474,8 @@ int br_mrp_set_ring_role(struct net_bridge *br, return 0; } -/* Start to generate MRP test frames, the frames are generated by HW and if it - * fails, they are generated by the SW. +/* Start to generate or monitor MRP test frames, the frames are generated by + * HW and if it fails, they are generated by the SW. * note: already called with rtnl_lock */ int br_mrp_start_test(struct net_bridge *br, @@ -464,16 +486,18 @@ int br_mrp_start_test(struct net_bridge *br, if (!mrp) return -EINVAL; - /* Try to push it to the HW and if it fails then continue to generate in - * SW and if that also fails then return error + /* Try to push it to the HW and if it fails then continue with SW + * implementation and if that also fails then return error. */ if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, - test->max_miss, test->period)) + test->max_miss, test->period, + test->monitor)) return 0; mrp->test_interval = test->interval; mrp->test_end = jiffies + usecs_to_jiffies(test->period); mrp->test_max_miss = test->max_miss; + mrp->test_monitor = test->monitor; mrp->test_count_miss = 0; queue_delayed_work(system_wq, &mrp->test_work, usecs_to_jiffies(test->interval)); @@ -510,6 +534,57 @@ static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, br_mrp_port_open(port->dev, false); } +/* Determin if the test hdr has a better priority than the node */ +static bool br_mrp_test_better_than_own(struct br_mrp *mrp, + struct net_bridge *br, + const struct br_mrp_ring_test_hdr *hdr) +{ + u16 prio = be16_to_cpu(hdr->prio); + + if (prio < mrp->prio || + (prio == mrp->prio && + ether_addr_to_u64(hdr->sa) < ether_addr_to_u64(br->dev->dev_addr))) + return true; + + return false; +} + +/* Process only MRP Test frame. All the other MRP frames are processed by + * userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_ring_test_hdr *test_hdr; + struct br_mrp_ring_test_hdr _test_hdr; + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + test_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), + sizeof(_test_hdr), &_test_hdr); + if (!test_hdr) + return; + + /* Only frames that have a better priority than the node will + * clear the miss counter because otherwise the node will need to behave + * as MRM. + */ + if (br_mrp_test_better_than_own(mrp, br, test_hdr)) + mrp->test_count_miss = 0; +} + /* This will just forward the frame to the other mrp ring port(MRC role) or will * not do anything. * note: already called with rcu_read_lock @@ -546,6 +621,18 @@ static int br_mrp_rcv(struct net_bridge_port *p, return 1; } + /* If the role is MRA then don't forward the frames if it behaves as + * MRM node + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + if (!mrp->test_monitor) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + br_mrp_mra_process(mrp, br, p, skb); + } + /* Clone the frame and forward it on the other MRP port */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index 8cb67d9ca44e..34b3a8776991 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -196,6 +196,7 @@ br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 }, }; static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, @@ -225,6 +226,11 @@ static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + test.monitor = false; + + if (tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]) + test.monitor = + nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]); return br_mrp_start_test(br, &test); } diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c index 3a776043bf80..0da68a0da4b5 100644 --- a/net/bridge/br_mrp_switchdev.c +++ b/net/bridge/br_mrp_switchdev.c @@ -65,7 +65,8 @@ int br_mrp_switchdev_set_ring_role(struct net_bridge *br, int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, u32 interval, - u8 max_miss, u32 period) + u8 max_miss, u32 period, + bool monitor) { struct switchdev_obj_ring_test_mrp test = { .obj.orig_dev = br->dev, @@ -74,6 +75,7 @@ int br_mrp_switchdev_send_ring_test(struct net_bridge *br, .max_miss = max_miss, .ring_id = mrp->ring_id, .period = period, + .monitor = monitor, }; int err; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 558941ce2366..33b255e38ffe 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -26,6 +26,7 @@ struct br_mrp { unsigned long test_end; u32 test_count_miss; u32 test_max_miss; + bool test_monitor; u32 seq_id; @@ -52,7 +53,8 @@ int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, enum br_mrp_ring_state_type state); int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, - u32 interval, u8 max_miss, u32 period); + u32 interval, u8 max_miss, u32 period, + bool monitor); int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, enum br_mrp_port_state_type state); int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, -- cgit v1.2.3 From 13d70f5a5ecff367db2fb18ed4ebe433eab8a74c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:15 -0700 Subject: bpf, sk_msg: Add get socket storage helpers Add helpers to use local socket storage. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033907577.12355.14740125020572756560.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index c3b496a19748..a6fc23447f12 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6449,6 +6449,10 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -7273,6 +7277,11 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): @@ -8609,6 +8618,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { -- cgit v1.2.3 From 457f44363a8894135c85b7a9afd2bd8196db24ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 May 2020 00:54:20 -0700 Subject: bpf: Implement BPF ring buffer and verifier support for it This commit adds a new MPSC ring buffer implementation into BPF ecosystem, which allows multiple CPUs to submit data to a single shared ring buffer. On the consumption side, only single consumer is assumed. Motivation ---------- There are two distinctive motivators for this work, which are not satisfied by existing perf buffer, which prompted creation of a new ring buffer implementation. - more efficient memory utilization by sharing ring buffer across CPUs; - preserving ordering of events that happen sequentially in time, even across multiple CPUs (e.g., fork/exec/exit events for a task). These two problems are independent, but perf buffer fails to satisfy both. Both are a result of a choice to have per-CPU perf ring buffer. Both can be also solved by having an MPSC implementation of ring buffer. The ordering problem could technically be solved for perf buffer with some in-kernel counting, but given the first one requires an MPSC buffer, the same solution would solve the second problem automatically. Semantics and APIs ------------------ Single ring buffer is presented to BPF programs as an instance of BPF map of type BPF_MAP_TYPE_RINGBUF. Two other alternatives considered, but ultimately rejected. One way would be to, similar to BPF_MAP_TYPE_PERF_EVENT_ARRAY, make BPF_MAP_TYPE_RINGBUF could represent an array of ring buffers, but not enforce "same CPU only" rule. This would be more familiar interface compatible with existing perf buffer use in BPF, but would fail if application needed more advanced logic to lookup ring buffer by arbitrary key. HASH_OF_MAPS addresses this with current approach. Additionally, given the performance of BPF ringbuf, many use cases would just opt into a simple single ring buffer shared among all CPUs, for which current approach would be an overkill. Another approach could introduce a new concept, alongside BPF map, to represent generic "container" object, which doesn't necessarily have key/value interface with lookup/update/delete operations. This approach would add a lot of extra infrastructure that has to be built for observability and verifier support. It would also add another concept that BPF developers would have to familiarize themselves with, new syntax in libbpf, etc. But then would really provide no additional benefits over the approach of using a map. BPF_MAP_TYPE_RINGBUF doesn't support lookup/update/delete operations, but so doesn't few other map types (e.g., queue and stack; array doesn't support delete, etc). The approach chosen has an advantage of re-using existing BPF map infrastructure (introspection APIs in kernel, libbpf support, etc), being familiar concept (no need to teach users a new type of object in BPF program), and utilizing existing tooling (bpftool). For common scenario of using a single ring buffer for all CPUs, it's as simple and straightforward, as would be with a dedicated "container" object. On the other hand, by being a map, it can be combined with ARRAY_OF_MAPS and HASH_OF_MAPS map-in-maps to implement a wide variety of topologies, from one ring buffer for each CPU (e.g., as a replacement for perf buffer use cases), to a complicated application hashing/sharding of ring buffers (e.g., having a small pool of ring buffers with hashed task's tgid being a look up key to preserve order, but reduce contention). Key and value sizes are enforced to be zero. max_entries is used to specify the size of ring buffer and has to be a power of 2 value. There are a bunch of similarities between perf buffer (BPF_MAP_TYPE_PERF_EVENT_ARRAY) and new BPF ring buffer semantics: - variable-length records; - if there is no more space left in ring buffer, reservation fails, no blocking; - memory-mappable data area for user-space applications for ease of consumption and high performance; - epoll notifications for new incoming data; - but still the ability to do busy polling for new data to achieve the lowest latency, if necessary. BPF ringbuf provides two sets of APIs to BPF programs: - bpf_ringbuf_output() allows to *copy* data from one place to a ring buffer, similarly to bpf_perf_event_output(); - bpf_ringbuf_reserve()/bpf_ringbuf_commit()/bpf_ringbuf_discard() APIs split the whole process into two steps. First, a fixed amount of space is reserved. If successful, a pointer to a data inside ring buffer data area is returned, which BPF programs can use similarly to a data inside array/hash maps. Once ready, this piece of memory is either committed or discarded. Discard is similar to commit, but makes consumer ignore the record. bpf_ringbuf_output() has disadvantage of incurring extra memory copy, because record has to be prepared in some other place first. But it allows to submit records of the length that's not known to verifier beforehand. It also closely matches bpf_perf_event_output(), so will simplify migration significantly. bpf_ringbuf_reserve() avoids the extra copy of memory by providing a memory pointer directly to ring buffer memory. In a lot of cases records are larger than BPF stack space allows, so many programs have use extra per-CPU array as a temporary heap for preparing sample. bpf_ringbuf_reserve() avoid this needs completely. But in exchange, it only allows a known constant size of memory to be reserved, such that verifier can verify that BPF program can't access memory outside its reserved record space. bpf_ringbuf_output(), while slightly slower due to extra memory copy, covers some use cases that are not suitable for bpf_ringbuf_reserve(). The difference between commit and discard is very small. Discard just marks a record as discarded, and such records are supposed to be ignored by consumer code. Discard is useful for some advanced use-cases, such as ensuring all-or-nothing multi-record submission, or emulating temporary malloc()/free() within single BPF program invocation. Each reserved record is tracked by verifier through existing reference-tracking logic, similar to socket ref-tracking. It is thus impossible to reserve a record, but forget to submit (or discard) it. bpf_ringbuf_query() helper allows to query various properties of ring buffer. Currently 4 are supported: - BPF_RB_AVAIL_DATA returns amount of unconsumed data in ring buffer; - BPF_RB_RING_SIZE returns the size of ring buffer; - BPF_RB_CONS_POS/BPF_RB_PROD_POS returns current logical possition of consumer/producer, respectively. Returned values are momentarily snapshots of ring buffer state and could be off by the time helper returns, so this should be used only for debugging/reporting reasons or for implementing various heuristics, that take into account highly-changeable nature of some of those characteristics. One such heuristic might involve more fine-grained control over poll/epoll notifications about new data availability in ring buffer. Together with BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags for output/commit/discard helpers, it allows BPF program a high degree of control and, e.g., more efficient batched notifications. Default self-balancing strategy, though, should be adequate for most applications and will work reliable and efficiently already. Design and implementation ------------------------- This reserve/commit schema allows a natural way for multiple producers, either on different CPUs or even on the same CPU/in the same BPF program, to reserve independent records and work with them without blocking other producers. This means that if BPF program was interruped by another BPF program sharing the same ring buffer, they will both get a record reserved (provided there is enough space left) and can work with it and submit it independently. This applies to NMI context as well, except that due to using a spinlock during reservation, in NMI context, bpf_ringbuf_reserve() might fail to get a lock, in which case reservation will fail even if ring buffer is not full. The ring buffer itself internally is implemented as a power-of-2 sized circular buffer, with two logical and ever-increasing counters (which might wrap around on 32-bit architectures, that's not a problem): - consumer counter shows up to which logical position consumer consumed the data; - producer counter denotes amount of data reserved by all producers. Each time a record is reserved, producer that "owns" the record will successfully advance producer counter. At that point, data is still not yet ready to be consumed, though. Each record has 8 byte header, which contains the length of reserved record, as well as two extra bits: busy bit to denote that record is still being worked on, and discard bit, which might be set at commit time if record is discarded. In the latter case, consumer is supposed to skip the record and move on to the next one. Record header also encodes record's relative offset from the beginning of ring buffer data area (in pages). This allows bpf_ringbuf_commit()/bpf_ringbuf_discard() to accept only the pointer to the record itself, without requiring also the pointer to ring buffer itself. Ring buffer memory location will be restored from record metadata header. This significantly simplifies verifier, as well as improving API usability. Producer counter increments are serialized under spinlock, so there is a strict ordering between reservations. Commits, on the other hand, are completely lockless and independent. All records become available to consumer in the order of reservations, but only after all previous records where already committed. It is thus possible for slow producers to temporarily hold off submitted records, that were reserved later. Reservation/commit/consumer protocol is verified by litmus tests in Documentation/litmus-test/bpf-rb. One interesting implementation bit, that significantly simplifies (and thus speeds up as well) implementation of both producers and consumers is how data area is mapped twice contiguously back-to-back in the virtual memory. This allows to not take any special measures for samples that have to wrap around at the end of the circular buffer data area, because the next page after the last data page would be first data page again, and thus the sample will still appear completely contiguous in virtual memory. See comment and a simple ASCII diagram showing this visually in bpf_ringbuf_area_alloc(). Another feature that distinguishes BPF ringbuf from perf ring buffer is a self-pacing notifications of new data being availability. bpf_ringbuf_commit() implementation will send a notification of new record being available after commit only if consumer has already caught up right up to the record being committed. If not, consumer still has to catch up and thus will see new data anyways without needing an extra poll notification. Benchmarks (see tools/testing/selftests/bpf/benchs/bench_ringbuf.c) show that this allows to achieve a very high throughput without having to resort to tricks like "notify only every Nth sample", which are necessary with perf buffer. For extreme cases, when BPF program wants more manual control of notifications, commit/discard/output helpers accept BPF_RB_NO_WAKEUP and BPF_RB_FORCE_WAKEUP flags, which give full control over notifications of data availability, but require extra caution and diligence in using this API. Comparison to alternatives -------------------------- Before considering implementing BPF ring buffer from scratch existing alternatives in kernel were evaluated, but didn't seem to meet the needs. They largely fell into few categores: - per-CPU buffers (perf, ftrace, etc), which don't satisfy two motivations outlined above (ordering and memory consumption); - linked list-based implementations; while some were multi-producer designs, consuming these from user-space would be very complicated and most probably not performant; memory-mapping contiguous piece of memory is simpler and more performant for user-space consumers; - io_uring is SPSC, but also requires fixed-sized elements. Naively turning SPSC queue into MPSC w/ lock would have subpar performance compared to locked reserve + lockless commit, as with BPF ring buffer. Fixed sized elements would be too limiting for BPF programs, given existing BPF programs heavily rely on variable-sized perf buffer already; - specialized implementations (like a new printk ring buffer, [0]) with lots of printk-specific limitations and implications, that didn't seem to fit well for intended use with BPF programs. [0] https://lwn.net/Articles/779550/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200529075424.3139988-2-andriin@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 + include/linux/bpf_types.h | 1 + include/linux/bpf_verifier.h | 4 + include/uapi/linux/bpf.h | 84 +++- kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 10 + kernel/bpf/ringbuf.c | 501 +++++++++++++++++++++ kernel/bpf/syscall.c | 12 + kernel/bpf/verifier.c | 195 ++++++-- kernel/trace/bpf_trace.c | 10 + tools/include/uapi/linux/bpf.h | 84 +++- tools/testing/selftests/bpf/verifier/and.c | 4 +- .../testing/selftests/bpf/verifier/array_access.c | 4 +- tools/testing/selftests/bpf/verifier/bounds.c | 6 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- .../selftests/bpf/verifier/direct_value_access.c | 4 +- .../selftests/bpf/verifier/helper_access_var_len.c | 2 +- .../selftests/bpf/verifier/helper_value_access.c | 6 +- .../selftests/bpf/verifier/value_ptr_arith.c | 8 +- 19 files changed, 882 insertions(+), 70 deletions(-) create mode 100644 kernel/bpf/ringbuf.c (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efe8836b5c48..e5884f7f801c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -90,6 +90,8 @@ struct bpf_map_ops { int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); + __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts); }; struct bpf_map_memory { @@ -244,6 +246,9 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ + ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ + ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ }; /* type of values returned from helper functions */ @@ -255,6 +260,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ + RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -322,6 +328,8 @@ enum bpf_reg_type { PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_BTF_ID, /* reg points to kernel struct */ PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ + PTR_TO_MEM, /* reg points to valid memory region */ + PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -1611,6 +1619,11 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; +extern const struct bpf_func_proto bpf_ringbuf_output_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_proto; +extern const struct bpf_func_proto bpf_ringbuf_query_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 29d22752fc87..fa8e1b552acd 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -118,6 +118,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) #if defined(CONFIG_BPF_JIT) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ea833087e853..ca08db4ffb5f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -54,6 +54,8 @@ struct bpf_reg_state { u32 btf_id; /* for PTR_TO_BTF_ID */ + u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* Max size from any of the above. */ unsigned long raw; }; @@ -63,6 +65,8 @@ struct bpf_reg_state { * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. + * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation + * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 375b933010dd..8fca02f64811 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4fb634275e..be43ab3e619f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c new file mode 100644 index 000000000000..180414bb0d3e --- /dev/null +++ b/kernel/bpf/ringbuf.c @@ -0,0 +1,501 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) + +/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ +#define RINGBUF_PGOFF \ + (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) +/* consumer page and producer page */ +#define RINGBUF_POS_PAGES 2 + +#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) + +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and take + * into account few extra pages for consumer/producer pages and + * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single + * ring buffer. + */ +#define RINGBUF_MAX_DATA_SZ \ + (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + +struct bpf_ringbuf { + wait_queue_head_t waitq; + struct irq_work work; + u64 mask; + struct page **pages; + int nr_pages; + spinlock_t spinlock ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to allow + * mapping consumer page as r/w, but restrict producer page to r/o. + * This protects producer position from being modified by user-space + * application and ruining in-kernel position tracking. + */ + unsigned long consumer_pos __aligned(PAGE_SIZE); + unsigned long producer_pos __aligned(PAGE_SIZE); + char data[] __aligned(PAGE_SIZE); +}; + +struct bpf_ringbuf_map { + struct bpf_map map; + struct bpf_map_memory memory; + struct bpf_ringbuf *rb; +}; + +/* 8-byte ring buffer record header structure */ +struct bpf_ringbuf_hdr { + u32 len; + u32 pg_off; +}; + +static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) +{ + const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO; + int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; + int nr_data_pages = data_sz >> PAGE_SHIFT; + int nr_pages = nr_meta_pages + nr_data_pages; + struct page **pages, *page; + struct bpf_ringbuf *rb; + size_t array_size; + int i; + + /* Each data page is mapped twice to allow "virtual" + * continuous read of samples wrapping around the end of ring + * buffer area: + * ------------------------------------------------------ + * | meta pages | real data pages | same data pages | + * ------------------------------------------------------ + * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | + * ------------------------------------------------------ + * | | TA DA | TA DA | + * ------------------------------------------------------ + * ^^^^^^^ + * | + * Here, no need to worry about special handling of wrapped-around + * data due to double-mapped data pages. This works both in kernel and + * when mmap()'ed in user-space, simplifying both kernel and + * user-space implementations significantly. + */ + array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); + if (array_size > PAGE_SIZE) + pages = vmalloc_node(array_size, numa_node); + else + pages = kmalloc_node(array_size, flags, numa_node); + if (!pages) + return NULL; + + for (i = 0; i < nr_pages; i++) { + page = alloc_pages_node(numa_node, flags, 0); + if (!page) { + nr_pages = i; + goto err_free_pages; + } + pages[i] = page; + if (i >= nr_meta_pages) + pages[nr_data_pages + i] = page; + } + + rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, + VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + if (rb) { + rb->pages = pages; + rb->nr_pages = nr_pages; + return rb; + } + +err_free_pages: + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); + return NULL; +} + +static void bpf_ringbuf_notify(struct irq_work *work) +{ + struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); + + wake_up_all(&rb->waitq); +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +{ + struct bpf_ringbuf *rb; + + if (!data_sz || !PAGE_ALIGNED(data_sz)) + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (data_sz > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + + rb = bpf_ringbuf_area_alloc(data_sz, numa_node); + if (!rb) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&rb->spinlock); + init_waitqueue_head(&rb->waitq); + init_irq_work(&rb->work, bpf_ringbuf_notify); + + rb->mask = data_sz - 1; + rb->consumer_pos = 0; + rb->producer_pos = 0; + + return rb; +} + +static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) +{ + struct bpf_ringbuf_map *rb_map; + u64 cost; + int err; + + if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size || attr->value_size || + attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + return ERR_PTR(-EINVAL); + + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + if (!rb_map) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rb_map->map, attr); + + cost = sizeof(struct bpf_ringbuf_map) + + sizeof(struct bpf_ringbuf) + + attr->max_entries; + err = bpf_map_charge_init(&rb_map->map.memory, cost); + if (err) + goto err_free_map; + + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + if (IS_ERR(rb_map->rb)) { + err = PTR_ERR(rb_map->rb); + goto err_uncharge; + } + + return &rb_map->map; + +err_uncharge: + bpf_map_charge_finish(&rb_map->map.memory); +err_free_map: + kfree(rb_map); + return ERR_PTR(err); +} + +static void bpf_ringbuf_free(struct bpf_ringbuf *rb) +{ + /* copy pages pointer and nr_pages to local variable, as we are going + * to unmap rb itself with vunmap() below + */ + struct page **pages = rb->pages; + int i, nr_pages = rb->nr_pages; + + vunmap(rb); + for (i = 0; i < nr_pages; i++) + __free_page(pages[i]); + kvfree(pages); +} + +static void ringbuf_map_free(struct bpf_map *map) +{ + struct bpf_ringbuf_map *rb_map; + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + bpf_ringbuf_free(rb_map->rb); + kfree(rb_map); +} + +static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-ENOTSUPP); +} + +static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +{ + return -ENOTSUPP; +} + +static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) +{ + size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; + + /* consumer page + producer page + 2 x data pages */ + return RINGBUF_POS_PAGES + 2 * data_pages; +} + +static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + size_t mmap_sz; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; + + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) + return -EINVAL; + + return remap_vmalloc_range(vma, rb_map->rb, + vma->vm_pgoff + RINGBUF_PGOFF); +} + +static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = smp_load_acquire(&rb->consumer_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; +} + +static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +const struct bpf_map_ops ringbuf_map_ops = { + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap, + .map_poll = ringbuf_map_poll, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, +}; + +/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, + * calculate offset from record metadata to ring buffer in pages, rounded + * down. This page offset is stored as part of record metadata and allows to + * restore struct bpf_ringbuf * from record pointer. This page offset is + * stored at offset 4 of record metadata header. + */ +static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, + struct bpf_ringbuf_hdr *hdr) +{ + return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; +} + +/* Given pointer to ring buffer record header, restore pointer to struct + * bpf_ringbuf itself by using page offset stored at offset 4 + */ +static struct bpf_ringbuf * +bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) +{ + unsigned long addr = (unsigned long)(void *)hdr; + unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; + + return (void*)((addr & PAGE_MASK) - off); +} + +static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) +{ + unsigned long cons_pos, prod_pos, new_prod_pos, flags; + u32 len, pg_off; + struct bpf_ringbuf_hdr *hdr; + + if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) + return NULL; + + len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + cons_pos = smp_load_acquire(&rb->consumer_pos); + + if (in_nmi()) { + if (!spin_trylock_irqsave(&rb->spinlock, flags)) + return NULL; + } else { + spin_lock_irqsave(&rb->spinlock, flags); + } + + prod_pos = rb->producer_pos; + new_prod_pos = prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + + hdr = (void *)rb->data + (prod_pos & rb->mask); + pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pg_off = pg_off; + + /* pairs with consumer's smp_load_acquire() */ + smp_store_release(&rb->producer_pos, new_prod_pos); + + spin_unlock_irqrestore(&rb->spinlock, flags); + + return (void *)hdr + BPF_RINGBUF_HDR_SZ; +} + +BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + + if (unlikely(flags)) + return 0; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); +} + +const struct bpf_func_proto bpf_ringbuf_reserve_proto = { + .func = bpf_ringbuf_reserve, + .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) +{ + unsigned long rec_pos, cons_pos; + struct bpf_ringbuf_hdr *hdr; + struct bpf_ringbuf *rb; + u32 new_len; + + hdr = sample - BPF_RINGBUF_HDR_SZ; + rb = bpf_ringbuf_restore_from_rec(hdr); + new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* update record header with correct final size prefix */ + xchg(&hdr->len, new_len); + + /* if consumer caught up and is waiting for our record, notify about + * new data availability + */ + rec_pos = (void *)hdr - (void *)rb->data; + cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) + irq_work_queue(&rb->work); +} + +BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_proto = { + .func = bpf_ringbuf_submit, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) +{ + bpf_ringbuf_commit(sample, flags, true /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_proto = { + .func = bpf_ringbuf_discard, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, + u64, flags) +{ + struct bpf_ringbuf_map *rb_map; + void *rec; + + if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) + return -EINVAL; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + rec = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!rec) + return -EAGAIN; + + memcpy(rec, data, size); + bpf_ringbuf_commit(rec, flags, false /* discard */); + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) +{ + struct bpf_ringbuf *rb; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + switch (flags) { + case BPF_RB_AVAIL_DATA: + return ringbuf_avail_data_sz(rb); + case BPF_RB_RING_SIZE: + return rb->mask + 1; + case BPF_RB_CONS_POS: + return smp_load_acquire(&rb->consumer_pos); + case BPF_RB_PROD_POS: + return smp_load_acquire(&rb->producer_pos); + default: + return 0; + } +} + +const struct bpf_func_proto bpf_ringbuf_query_proto = { + .func = bpf_ringbuf_query, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c969a9b90d3..9de3540fa90c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -662,6 +663,16 @@ out: return err; } +static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_poll) + return map->ops->map_poll(map, filp, pts); + + return EPOLLERR; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = { .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, + .poll = bpf_map_poll, }; int bpf_map_new_fd(struct bpf_map *map, int flags) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d725a26f66e..5c7bbaac81ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + int mem_size; u64 msize_max_value; int ref_obj_id; int func_id; @@ -408,7 +409,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL; + type == PTR_TO_BTF_ID_OR_NULL || + type == PTR_TO_MEM_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -422,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_MEM || + type == PTR_TO_MEM_OR_NULL; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -436,7 +440,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return func_id == BPF_FUNC_sk_release; + return func_id == BPF_FUNC_sk_release || + func_id == BPF_FUNC_ringbuf_submit || + func_id == BPF_FUNC_ringbuf_discard; } static bool may_be_acquire_function(enum bpf_func_id func_id) @@ -444,7 +450,8 @@ static bool may_be_acquire_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem; + func_id == BPF_FUNC_map_lookup_elem || + func_id == BPF_FUNC_ringbuf_reserve; } static bool is_acquire_function(enum bpf_func_id func_id, @@ -454,7 +461,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp) + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_ringbuf_reserve) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -494,6 +502,8 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_MEM] = "mem", + [PTR_TO_MEM_OR_NULL] = "mem_or_null", }; static char slot_type_char[] = { @@ -2468,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, return 0; } -/* check read/write into map element returned by bpf_map_lookup_elem() */ -static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, bool zero_size_allowed) +/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ +static int __check_mem_access(struct bpf_verifier_env *env, int regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + bool size_ok = size > 0 || (size == 0 && zero_size_allowed); + struct bpf_reg_state *reg; + + if (off >= 0 && size_ok && (u64)off + size <= mem_size) + return 0; - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - off + size > map->value_size) { + reg = &cur_regs(env)[regno]; + switch (reg->type) { + case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", - map->value_size, off, size); - return -EACCES; + mem_size, off, size); + break; + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_PACKET_END: + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, off, mem_size); + break; + case PTR_TO_MEM: + default: + verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", + mem_size, off, size); } - return 0; + + return -EACCES; } -/* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) +/* check read/write into a memory region with possible variable offset */ +static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, u32 mem_size, + bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We may have adjusted the register to this map value, so we + /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ @@ -2514,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size, - zero_size_allowed); + err = __check_mem_access(env, regno, reg->smin_value + off, size, + mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the array range\n", + verbose(env, "R%d min value is outside of the allowed memory range\n", regno); return err; } @@ -2527,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size, - zero_size_allowed); - if (err) - verbose(env, "R%d max value is outside of the array range\n", + err = __check_mem_access(env, regno, reg->umax_value + off, size, + mem_size, zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of the allowed memory range\n", regno); + return err; + } + + return 0; +} - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, + int off, int size, bool zero_size_allowed) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_mem_region_access(env, regno, off, size, map->value_size, + zero_size_allowed); + if (err) + return err; + + if (map_value_has_spin_lock(map)) { + u32 lock = map->spin_lock_off; /* if any part of struct bpf_spin_lock can be touched by * load/store reject this program. @@ -2596,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - - if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || - (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, reg->off, reg->range); - return -EACCES; - } - return 0; -} - static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { @@ -2631,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size, zero_size_allowed); + err = __check_mem_access(env, regno, off, size, reg->range, + zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; } - /* __check_packet_access has made sure "off + size - 1" is within u16. + /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info - * that __check_packet_access would have rejected this pkt access. + * that __check_mem_access would have rejected this pkt access. * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = @@ -3220,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } + } else if (reg->type == PTR_TO_MEM) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; u32 btf_id = 0; @@ -3557,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MEM: + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3661,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_ALLOC_MEM || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + +static bool arg_type_is_alloc_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; +} + static bool arg_type_is_int_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_INT || @@ -3720,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + arg_type == ARG_CONST_SIZE_OR_ZERO || + arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { expected_type = SCALAR_VALUE; if (type != expected_type) goto err_type; @@ -3791,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * happens during stack boundary checking. */ if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + (arg_type == ARG_PTR_TO_MEM_OR_NULL || + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_alloc_mem_ptr(arg_type)) { + expected_type = PTR_TO_MEM; + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) + /* final test in check_stack_boundary() */; + else if (type != expected_type) + goto err_type; + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -3893,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); + } else if (arg_type_is_alloc_size(arg_type)) { + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + meta->mem_size = reg->var_off.value; } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -3929,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_xdp_output) goto error; break; + case BPF_MAP_TYPE_RINGBUF: + if (func_id != BPF_FUNC_ringbuf_output && + func_id != BPF_FUNC_ringbuf_reserve && + func_id != BPF_FUNC_ringbuf_submit && + func_id != BPF_FUNC_ringbuf_discard && + func_id != BPF_FUNC_ringbuf_query) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -4655,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].mem_size = meta.mem_size; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -6611,6 +6706,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_TCP_SOCK; } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { reg->type = PTR_TO_BTF_ID; + } else if (reg->type == PTR_TO_MEM_OR_NULL) { + reg->type = PTR_TO_MEM; } if (is_null) { /* We don't need id and ref_obj_id from this point diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187cd6995bbb..3767d34114c0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_value_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54b93f8b49b8..974ca6e948e3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -147,6 +147,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, }; /* Note that tracing related programs such as @@ -3157,6 +3158,59 @@ union bpf_attr { * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. + * + * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * 0, on success; + * < 0, on error. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of + * new data availability is sent. + * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of + * new data availability is sent unconditionally. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; + * - BPF_RB_RING_SIZE - the size of ring buffer; + * - BPF_RB_CONS_POS - consumer position (can wrap around); + * - BPF_RB_PROD_POS - producer(s) position (can wrap around); + * Data returned is just a momentary snapshots of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if flags are not recognized. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3288,7 +3342,12 @@ union bpf_attr { FN(seq_printf), \ FN(seq_write), \ FN(sk_cgroup_id), \ - FN(sk_ancestor_cgroup_id), + FN(sk_ancestor_cgroup_id), \ + FN(ringbuf_output), \ + FN(ringbuf_reserve), \ + FN(ringbuf_submit), \ + FN(ringbuf_discard), \ + FN(ringbuf_query), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3398,6 +3457,29 @@ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index e0fad1548737..d781bc86e100 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -15,7 +15,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -44,7 +44,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index f3c33e128709..1c4b1939f5a8 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -117,7 +117,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -137,7 +137,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R0 unbounded memory access, make sure to bounds check any such access", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 58f4aa593b1b..4d6645f2874c 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -20,7 +20,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, }, { @@ -146,7 +146,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT }, { @@ -354,7 +354,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT }, { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 7629a0cebb9b..94258c6b5235 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -105,7 +105,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_hash_8b = { 16 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "calls: overlapping caller/callee", diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index b9fb28e8e224..988f46a1a4c7 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -68,7 +68,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 7", @@ -220,7 +220,7 @@ }, .fixup_map_array_small = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 19", diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 67ab12410050..5a605ae131a9 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -318,7 +318,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 4 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c index 7572e403ddb9..961f28139b96 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -280,7 +280,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -415,7 +415,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -926,7 +926,7 @@ }, .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, - .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R2 unbounded memory access, make sure to bounds check any such access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a53d99cebd9f..97ee658e1242 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -50,7 +50,7 @@ .fixup_map_array_48b = { 8 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R0 min value is outside of the array range", + .errstr_unpriv = "R0 min value is outside of the allowed memory range", .retval = 1, }, { @@ -325,7 +325,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result_unpriv = REJECT, .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", }, @@ -601,7 +601,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R1 max value is outside of the array range", + .errstr = "R1 max value is outside of the allowed memory range", .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -726,7 +726,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "map access: value_ptr -= known scalar, 2", -- cgit v1.2.3 From c3c16f2ea6d20159903cf93afbb1155f3d8348d5 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Tue, 26 May 2020 17:34:36 -0700 Subject: bpf: Add rx_queue_mapping to bpf_sock Add "rx_queue_mapping" to bpf_sock. This gives read access for the existing field (sk_rx_queue_mapping) of struct sock from bpf_sock. Semantics for the bpf_sock rx_queue_mapping access are similar to sk_rx_queue_get(), i.e the value NO_QUEUE_MAPPING is not allowed and -1 is returned in that case. This is useful for transmit queue selection based on the received queue index which is cached in the socket in the receive path. v3: Addressed review comments to add usecase in patch description, and fixed default value for rx_queue_mapping. v2: fixed build error for CONFIG_XPS wrapping, reported by kbuild test robot Signed-off-by: Amritha Nambiar Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974ca6e948e3..630432c5c292 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3612,6 +3612,7 @@ struct bpf_sock { __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; + __s32 rx_queue_mapping; }; struct bpf_tcp_sock { diff --git a/net/core/filter.c b/net/core/filter.c index a6fc23447f12..0008b029d644 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6849,6 +6849,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): @@ -7897,6 +7898,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_state), target_size)); break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_XPS + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; } return insn - insn_buf; -- cgit v1.2.3 From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on an XDP_REDIRECT and before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the prorgam to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skb's. Block the use case by walking maps used by a program that is to be attached via xdpgeneric and fail if any of them are DEVMAP / DEVMAP_HASH with Block attach of BPF_XDP_DEVMAP programs to devices. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 18 +++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 109 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5884f7f801c..e042311f991f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1250,6 +1250,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); +bool dev_map_can_have_prog(struct bpf_map *map); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); @@ -1363,6 +1364,10 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map { return NULL; } +static inline bool dev_map_can_have_prog(struct bpf_map *map) +{ + return false; +} static inline void __dev_flush(void) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 630432c5c292..f1e364d69007 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; diff --git a/net/core/dev.c b/net/core/dev.c index ae37586f6ee8..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5420,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -8835,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974ca6e948e3..65d7717bce2f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 64b59025c15b244c0954cf52b24fbabfcf5ed8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:14 -0600 Subject: xdp: Add xdp_txq_info to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the moment only the device is added. Other fields (queue_index) can be added as use cases arise. >From a UAPI perspective, add egress_ifindex to xdp context for bpf programs to see the Tx device. Update the verifier to only allow accesses to egress_ifindex by XDP programs with BPF_XDP_DEVMAP expected attach type. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-4-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 5 +++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/devmap.c | 3 +++ net/core/filter.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 5 files changed, 29 insertions(+) (limited to 'include/uapi') diff --git a/include/net/xdp.h b/include/net/xdp.h index 90f11760bd12..d54022959491 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -61,12 +61,17 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_txq_info { + struct net_device *dev; +}; + struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; + struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1e364d69007..f862a58fb567 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3707,6 +3707,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0089d56617ec..c04fb1c72f5e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -476,8 +476,11 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp->txq = &txq; + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: diff --git a/net/core/filter.c b/net/core/filter.c index 0008b029d644..85ff827aab73 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7015,6 +7015,13 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { @@ -7985,6 +7992,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_rxq_info, queue_index)); break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 65d7717bce2f..f74bc4a2385e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3706,6 +3706,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { -- cgit v1.2.3 From 7f045a49fee04b5662cbdeaf0838f9322ae8c63a Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:38 +0200 Subject: bpf: Add link-based BPF program attachment to network namespace Extend bpf() syscall subcommands that operate on bpf_link, that is LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO, to accept attach types tied to network namespaces (only flow dissector at the moment). Link-based and prog-based attachment can be used interchangeably, but only one can exist at a time. Attempts to attach a link when a prog is already attached directly, and the other way around, will be met with -EEXIST. Attempts to detach a program when link exists result in -EINVAL. Attachment of multiple links of same attach type to one netns is not supported with the intention to lift the restriction when a use-case presents itself. Because of that link create returns -E2BIG when trying to create another netns link, when one already exists. Link-based attachments to netns don't keep a netns alive by holding a ref to it. Instead links get auto-detached from netns when the latter is being destroyed, using a pernet pre_exit callback. When auto-detached, link lives in defunct state as long there are open FDs for it. -ENOLINK is returned if a user tries to update a defunct link. Because bpf_link to netns doesn't hold a ref to struct net, special care is taken when releasing, updating, or filling link info. The netns might be getting torn down when any of these link operations are in progress. That is why auto-detach and update/release/fill_info are synchronized by the same mutex. Also, link ops have to always check if auto-detach has not happened yet and if netns is still alive (refcnt > 0). Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-5-jakub@cloudflare.com --- include/linux/bpf-netns.h | 8 ++ include/linux/bpf_types.h | 3 + include/net/netns/bpf.h | 1 + include/uapi/linux/bpf.h | 5 + kernel/bpf/net_namespace.c | 244 ++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 3 + tools/include/uapi/linux/bpf.h | 5 + 7 files changed, 267 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index f3aec3d79824..4052d649f36d 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -34,6 +34,8 @@ int netns_bpf_prog_query(const union bpf_attr *attr, int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netns_bpf_prog_detach(const union bpf_attr *attr); +int netns_bpf_link_create(const union bpf_attr *attr, + struct bpf_prog *prog); #else static inline int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -51,6 +53,12 @@ static inline int netns_bpf_prog_detach(const union bpf_attr *attr) { return -EOPNOTSUPP; } + +static inline int netns_bpf_link_create(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif #endif /* _BPF_NETNS_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa8e1b552acd..a18ae82a298a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,3 +126,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) +#ifdef CONFIG_NET +BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) +#endif diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a858d1c5b166..a8dce2a380c8 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -12,6 +12,7 @@ struct bpf_prog; struct netns_bpf { struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; + struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; }; #endif /* __NETNS_BPF_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f862a58fb567..b9ed9f14f2a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -237,6 +237,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, MAX_BPF_LINK_TYPE, }; @@ -3839,6 +3840,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b37d81450c3a..78cf061f8179 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -8,9 +8,140 @@ * Functions to manage BPF programs attached to netns */ +struct bpf_netns_link { + struct bpf_link link; + enum bpf_attach_type type; + enum netns_bpf_attach_type netns_type; + + /* We don't hold a ref to net in order to auto-detach the link + * when netns is going away. Instead we rely on pernet + * pre_exit callback to clear this pointer. Must be accessed + * with netns_bpf_mutex held. + */ + struct net *net; +}; + /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +/* Must be called with netns_bpf_mutex held. */ +static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + net_link->net = NULL; +} + +static void bpf_netns_link_release(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + + /* Link auto-detached by dying netns. */ + if (!net_link->net) + return; + + mutex_lock(&netns_bpf_mutex); + + /* Recheck after potential sleep. We can race with cleanup_net + * here, but if we see a non-NULL struct net pointer pre_exit + * has not happened yet and will block on netns_bpf_mutex. + */ + net = net_link->net; + if (!net) + goto out_unlock; + + net->bpf.links[type] = NULL; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); +} + +static void bpf_netns_link_dealloc(struct bpf_link *link) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + + kfree(net_link); +} + +static int bpf_netns_link_update_prog(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + enum netns_bpf_attach_type type = net_link->netns_type; + struct net *net; + int ret = 0; + + if (old_prog && old_prog != link->prog) + return -EPERM; + if (new_prog->type != link->prog->type) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + + net = net_link->net; + if (!net || !check_net(net)) { + /* Link auto-detached or netns dying */ + ret = -ENOLINK; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + rcu_assign_pointer(net->bpf.progs[type], new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return ret; +} + +static int bpf_netns_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); + unsigned int inum = 0; + struct net *net; + + mutex_lock(&netns_bpf_mutex); + net = net_link->net; + if (net && check_net(net)) + inum = net->ns.inum; + mutex_unlock(&netns_bpf_mutex); + + info->netns.netns_ino = inum; + info->netns.attach_type = net_link->type; + return 0; +} + +static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_link_info info = {}; + + bpf_netns_link_fill_info(link, &info); + seq_printf(seq, + "netns_ino:\t%u\n" + "attach_type:\t%u\n", + info.netns.netns_ino, + info.netns.attach_type); +} + +static const struct bpf_link_ops bpf_netns_link_ops = { + .release = bpf_netns_link_release, + .dealloc = bpf_netns_link_dealloc, + .update_prog = bpf_netns_link_update_prog, + .fill_link_info = bpf_netns_link_fill_info, + .show_fdinfo = bpf_netns_link_show_fdinfo, +}; + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -67,6 +198,13 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); + + /* Attaching prog directly is not compatible with links */ + if (net->bpf.links[type]) { + ret = -EEXIST; + goto out_unlock; + } + switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach(net, prog); @@ -75,6 +213,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) ret = -EINVAL; break; } +out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; @@ -86,6 +225,10 @@ static int __netns_bpf_prog_detach(struct net *net, { struct bpf_prog *attached; + /* Progs attached via links cannot be detached */ + if (net->bpf.links[type]) + return -EINVAL; + attached = rcu_dereference_protected(net->bpf.progs[type], lockdep_is_held(&netns_bpf_mutex)); if (!attached) @@ -111,13 +254,110 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) return ret; } +static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *prog; + int err; + + mutex_lock(&netns_bpf_mutex); + + /* Allow attaching only one prog or link for now */ + if (net->bpf.links[type]) { + err = -E2BIG; + goto out_unlock; + } + /* Links are not compatible with attaching prog directly */ + prog = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (prog) { + err = -EEXIST; + goto out_unlock; + } + + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + err = flow_dissector_bpf_prog_attach(net, link->prog); + break; + default: + err = -EINVAL; + break; + } + if (err) + goto out_unlock; + + net->bpf.links[type] = link; + +out_unlock: + mutex_unlock(&netns_bpf_mutex); + return err; +} + +int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type netns_type; + struct bpf_link_primer link_primer; + struct bpf_netns_link *net_link; + enum bpf_attach_type type; + struct net *net; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + type = attr->link_create.attach_type; + netns_type = to_netns_bpf_attach_type(type); + if (netns_type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->link_create.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + net_link = kzalloc(sizeof(*net_link), GFP_USER); + if (!net_link) { + err = -ENOMEM; + goto out_put_net; + } + bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, + &bpf_netns_link_ops, prog); + net_link->net = net; + net_link->type = type; + net_link->netns_type = netns_type; + + err = bpf_link_prime(&net_link->link, &link_primer); + if (err) { + kfree(net_link); + goto out_put_net; + } + + err = netns_bpf_link_attach(net, &net_link->link, netns_type); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_net; + } + + put_net(net); + return bpf_link_settle(&link_primer); + +out_put_net: + put_net(net); + return err; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; + struct bpf_link *link; mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + link = net->bpf.links[type]; + if (link) + bpf_netns_link_auto_detach(link); + else + __netns_bpf_prog_detach(net, type); + } mutex_unlock(&netns_bpf_mutex); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c77ab9c76f7b..e14a842d7e0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3887,6 +3887,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_TRACING: ret = tracing_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = netns_bpf_link_create(attr, prog); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f862a58fb567..b9ed9f14f2a2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -237,6 +237,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, MAX_BPF_LINK_TYPE, }; @@ -3839,6 +3840,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From e7c8cc35a64d1d21e6fb811c519eadbda30f4f77 Mon Sep 17 00:00:00 2001 From: Matej Genci Date: Wed, 11 Sep 2019 12:49:53 +0000 Subject: virtio: add VIRTIO_RING_NO_LEGACY Add macro to disable legacy vring functions. Signed-off-by: Matej Genci Link: https://lore.kernel.org/r/20190911124942.243713-1-matej.genci@nutanix.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 1 + include/uapi/linux/virtio_ring.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 7abcc50838b8..db93cedd262f 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -16,6 +16,7 @@ #include #define VIRTIO_PCI_NO_LEGACY +#define VIRTIO_RING_NO_LEGACY #include "virtio_pci_common.h" /* diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 559f42e73315..9223c3a5c46a 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -135,6 +135,8 @@ struct vring { #define VRING_USED_ALIGN_SIZE 4 #define VRING_DESC_ALIGN_SIZE 16 +#ifndef VIRTIO_RING_NO_LEGACY + /* The standard layout for the ring is a continuous chunk of memory which looks * like this. We assume num is a power of 2. * @@ -181,6 +183,8 @@ static inline unsigned vring_size(unsigned int num, unsigned long align) + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num; } +#endif /* VIRTIO_RING_NO_LEGACY */ + /* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */ /* Assuming a given event_idx value from the other side, if * we have just incremented index from old to new_idx, -- cgit v1.2.3 From a865e420b9561235851c3f5d483c82ef389d29bd Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Apr 2020 08:42:55 -0400 Subject: virtio: force spec specified alignment on types The ring element addresses are passed between components with different alignments assumptions. Thus, if guest/userspace selects a pointer and host then gets and dereferences it, we might need to decrease the compiler-selected alignment to prevent compiler on the host from assuming pointer is aligned. This actually triggers on ARM with -mabi=apcs-gnu - which is a deprecated configuration, but it seems safer to handle this generally. Note that userspace that allocates the memory is actually OK and does not need to be fixed, but userspace that gets it from guest or another process does need to be fixed. The later doesn't generally talk to the kernel so while it might be buggy it's not talking to the kernel in the buggy way - it's just using the header in the buggy way - so fixing header and asking userspace to recompile is the best we can do. I verified that the produced kernel binary on x86 is exactly identical before and after the change. Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vhost.c | 8 +++---- drivers/vhost/vhost.h | 6 +++--- drivers/vhost/vringh.c | 6 +++--- include/linux/vringh.h | 6 +++--- include/uapi/linux/virtio_ring.h | 46 ++++++++++++++++++++++++++++++---------- 5 files changed, 48 insertions(+), 24 deletions(-) (limited to 'include/uapi') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 21a59b598ed8..96d9871fa0cb 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1244,9 +1244,9 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) } static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used) + vring_desc_t __user *desc, + vring_avail_t __user *avail, + vring_used_t __user *used) { return access_ok(desc, vhost_get_desc_size(vq, num)) && @@ -2301,7 +2301,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, unsigned count) { - struct vring_used_elem __user *used; + vring_used_elem_t __user *used; u16 old, new; int start; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f8403bd46b85..60cab4c78229 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -67,9 +67,9 @@ struct vhost_virtqueue { /* The actual ring of buffers. */ struct mutex mutex; unsigned int num; - struct vring_desc __user *desc; - struct vring_avail __user *avail; - struct vring_used __user *used; + vring_desc_t __user *desc; + vring_avail_t __user *avail; + vring_used_t __user *used; const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS]; struct file *kick; struct eventfd_ctx *call_ctx; diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index ba8e0d6cfd97..e059a9a47cdf 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -620,9 +620,9 @@ static inline int xfer_to_user(const struct vringh *vrh, */ int vringh_init_user(struct vringh *vrh, u64 features, unsigned int num, bool weak_barriers, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used) + vring_desc_t __user *desc, + vring_avail_t __user *avail, + vring_used_t __user *used) { /* Sane power of 2 please! */ if (!num || num > 0xffff || (num & (num - 1))) { diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 9e2763d7c159..59bd50f99291 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -105,9 +105,9 @@ struct vringh_kiov { /* Helpers for userspace vrings. */ int vringh_init_user(struct vringh *vrh, u64 features, unsigned int num, bool weak_barriers, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used); + vring_desc_t __user *desc, + vring_avail_t __user *avail, + vring_used_t __user *used); static inline void vringh_iov_init(struct vringh_iov *iov, struct iovec *iovec, unsigned num) diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 9223c3a5c46a..476d3e5c0fe7 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -86,6 +86,13 @@ * at the end of the used ring. Guest should ignore the used->flags field. */ #define VIRTIO_RING_F_EVENT_IDX 29 +/* Alignment requirements for vring elements. + * When using pre-virtio 1.0 layout, these fall out naturally. + */ +#define VRING_AVAIL_ALIGN_SIZE 2 +#define VRING_USED_ALIGN_SIZE 4 +#define VRING_DESC_ALIGN_SIZE 16 + /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ struct vring_desc { /* Address (guest-physical). */ @@ -112,29 +119,46 @@ struct vring_used_elem { __virtio32 len; }; +typedef struct vring_used_elem __attribute__((aligned(VRING_USED_ALIGN_SIZE))) + vring_used_elem_t; + struct vring_used { __virtio16 flags; __virtio16 idx; - struct vring_used_elem ring[]; + vring_used_elem_t ring[]; }; +/* + * The ring element addresses are passed between components with different + * alignments assumptions. Thus, we might need to decrease the compiler-selected + * alignment, and so must use a typedef to make sure the aligned attribute + * actually takes hold: + * + * https://gcc.gnu.org/onlinedocs//gcc/Common-Type-Attributes.html#Common-Type-Attributes + * + * When used on a struct, or struct member, the aligned attribute can only + * increase the alignment; in order to decrease it, the packed attribute must + * be specified as well. When used as part of a typedef, the aligned attribute + * can both increase and decrease alignment, and specifying the packed + * attribute generates a warning. + */ +typedef struct vring_desc __attribute__((aligned(VRING_DESC_ALIGN_SIZE))) + vring_desc_t; +typedef struct vring_avail __attribute__((aligned(VRING_AVAIL_ALIGN_SIZE))) + vring_avail_t; +typedef struct vring_used __attribute__((aligned(VRING_USED_ALIGN_SIZE))) + vring_used_t; + struct vring { unsigned int num; - struct vring_desc *desc; + vring_desc_t *desc; - struct vring_avail *avail; + vring_avail_t *avail; - struct vring_used *used; + vring_used_t *used; }; -/* Alignment requirements for vring elements. - * When using pre-virtio 1.0 layout, these fall out naturally. - */ -#define VRING_AVAIL_ALIGN_SIZE 2 -#define VRING_USED_ALIGN_SIZE 4 -#define VRING_DESC_ALIGN_SIZE 16 - #ifndef VIRTIO_RING_NO_LEGACY /* The standard layout for the ring is a continuous chunk of memory which looks -- cgit v1.2.3 From 24c986748ba670c903a9d6a11ee96de2b3f5f1b8 Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Tue, 5 May 2020 14:27:41 +0200 Subject: vfio-ccw: Introduce a new schib region The schib region can be used by userspace to get the subchannel- information block (SCHIB) for the passthrough subchannel. This can be useful to get information such as channel path information via the SCHIB.PMCW fields. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-5-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- Documentation/s390/vfio-ccw.rst | 18 ++++++++- drivers/s390/cio/Makefile | 2 +- drivers/s390/cio/vfio_ccw_chp.c | 76 +++++++++++++++++++++++++++++++++++++ drivers/s390/cio/vfio_ccw_drv.c | 20 ++++++++++ drivers/s390/cio/vfio_ccw_ops.c | 14 ++++++- drivers/s390/cio/vfio_ccw_private.h | 3 ++ include/uapi/linux/vfio.h | 1 + include/uapi/linux/vfio_ccw.h | 10 +++++ 8 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 drivers/s390/cio/vfio_ccw_chp.c (limited to 'include/uapi') diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index 3a946fd45562..32310df525ba 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -282,6 +282,21 @@ for each access of the region. The following values may occur: ``-EBUSY`` The subchannel was status pending or busy while processing a halt request. +vfio-ccw schib region +--------------------- + +The vfio-ccw schib region is used to return Subchannel-Information +Block (SCHIB) data to userspace:: + + struct ccw_schib_region { + #define SCHIB_AREA_SIZE 52 + __u8 schib_area[SCHIB_AREA_SIZE]; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_SCHIB. + +Reading this region triggers a STORE SUBCHANNEL to be issued to the +associated hardware. vfio-ccw operation details -------------------------- @@ -385,7 +400,8 @@ through DASD/ECKD device online in a guest now and use it as a block device. The current code allows the guest to start channel programs via -START SUBCHANNEL, and to issue HALT SUBCHANNEL and CLEAR SUBCHANNEL. +START SUBCHANNEL, and to issue HALT SUBCHANNEL, CLEAR SUBCHANNEL, +and STORE SUBCHANNEL. Currently all channel programs are prefetched, regardless of the p-bit setting in the ORB. As a result, self modifying channel diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile index 23eae4188876..a9235f111e79 100644 --- a/drivers/s390/cio/Makefile +++ b/drivers/s390/cio/Makefile @@ -21,5 +21,5 @@ qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o obj-$(CONFIG_QDIO) += qdio.o vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o vfio_ccw_fsm.o \ - vfio_ccw_async.o vfio_ccw_trace.o + vfio_ccw_async.o vfio_ccw_trace.o vfio_ccw_chp.o obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c new file mode 100644 index 000000000000..18f3b3e873a9 --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Channel path related status regions for vfio_ccw + * + * Copyright IBM Corp. 2020 + * + * Author(s): Farhan Ali + * Eric Farman + */ + +#include +#include "vfio_ccw_private.h" + +static ssize_t vfio_ccw_schib_region_read(struct vfio_ccw_private *private, + char __user *buf, size_t count, + loff_t *ppos) +{ + unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; + loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; + struct ccw_schib_region *region; + int ret; + + if (pos + count > sizeof(*region)) + return -EINVAL; + + mutex_lock(&private->io_mutex); + region = private->region[i].data; + + if (cio_update_schib(private->sch)) { + ret = -ENODEV; + goto out; + } + + memcpy(region, &private->sch->schib, sizeof(*region)); + + if (copy_to_user(buf, (void *)region + pos, count)) { + ret = -EFAULT; + goto out; + } + + ret = count; + +out: + mutex_unlock(&private->io_mutex); + return ret; +} + +static ssize_t vfio_ccw_schib_region_write(struct vfio_ccw_private *private, + const char __user *buf, size_t count, + loff_t *ppos) +{ + return -EINVAL; +} + + +static void vfio_ccw_schib_region_release(struct vfio_ccw_private *private, + struct vfio_ccw_region *region) +{ + +} + +const struct vfio_ccw_regops vfio_ccw_schib_region_ops = { + .read = vfio_ccw_schib_region_read, + .write = vfio_ccw_schib_region_write, + .release = vfio_ccw_schib_region_release, +}; + +int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private) +{ + return vfio_ccw_register_dev_region(private, + VFIO_REGION_SUBTYPE_CCW_SCHIB, + &vfio_ccw_schib_region_ops, + sizeof(struct ccw_schib_region), + VFIO_REGION_INFO_FLAG_READ, + private->schib_region); +} diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index fb1275a7d1f5..7aeff42f370d 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -27,6 +27,7 @@ struct workqueue_struct *vfio_ccw_work_q; static struct kmem_cache *vfio_ccw_io_region; static struct kmem_cache *vfio_ccw_cmd_region; +static struct kmem_cache *vfio_ccw_schib_region; debug_info_t *vfio_ccw_debug_msg_id; debug_info_t *vfio_ccw_debug_trace_id; @@ -119,6 +120,8 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) static void vfio_ccw_free_regions(struct vfio_ccw_private *private) { + if (private->schib_region) + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); if (private->cmd_region) kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); if (private->io_region) @@ -156,6 +159,12 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (!private->cmd_region) goto out_free; + private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, + GFP_KERNEL | GFP_DMA); + + if (!private->schib_region) + goto out_free; + private->sch = sch; dev_set_drvdata(&sch->dev, private); mutex_init(&private->io_mutex); @@ -357,6 +366,7 @@ static void vfio_ccw_debug_exit(void) static void vfio_ccw_destroy_regions(void) { + kmem_cache_destroy(vfio_ccw_schib_region); kmem_cache_destroy(vfio_ccw_cmd_region); kmem_cache_destroy(vfio_ccw_io_region); } @@ -393,6 +403,16 @@ static int __init vfio_ccw_sch_init(void) goto out_err; } + vfio_ccw_schib_region = kmem_cache_create_usercopy("vfio_ccw_schib_region", + sizeof(struct ccw_schib_region), 0, + SLAB_ACCOUNT, 0, + sizeof(struct ccw_schib_region), NULL); + + if (!vfio_ccw_schib_region) { + ret = -ENOMEM; + goto out_err; + } + isc_register(VFIO_CCW_ISC); ret = css_driver_register(&vfio_ccw_sch_driver); if (ret) { diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index d4fc84b8867f..22988d67b6bb 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -172,8 +172,18 @@ static int vfio_ccw_mdev_open(struct mdev_device *mdev) ret = vfio_ccw_register_async_dev_regions(private); if (ret) - vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, - &private->nb); + goto out_unregister; + + ret = vfio_ccw_register_schib_dev_regions(private); + if (ret) + goto out_unregister; + + return ret; + +out_unregister: + vfio_ccw_unregister_dev_regions(private); + vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &private->nb); return ret; } diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index ce3834159d98..d6601a8adf13 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -56,6 +56,7 @@ int vfio_ccw_register_dev_region(struct vfio_ccw_private *private, void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); +int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private); /** * struct vfio_ccw_private @@ -69,6 +70,7 @@ int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); * @io_mutex: protect against concurrent update of I/O regions * @region: additional regions for other subchannel operations * @cmd_region: MMIO region for asynchronous I/O commands other than START + * @schib_region: MMIO region for SCHIB information * @num_regions: number of additional regions * @cp: channel program for the current I/O operation * @irb: irb info received from interrupt @@ -87,6 +89,7 @@ struct vfio_ccw_private { struct mutex io_mutex; struct vfio_ccw_region *region; struct ccw_cmd_region *cmd_region; + struct ccw_schib_region *schib_region; int num_regions; struct channel_program cp; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 015516bcfaa3..7a1abbd889bd 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -378,6 +378,7 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) +#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2) /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h index cbecbf0cd54f..758bf214898d 100644 --- a/include/uapi/linux/vfio_ccw.h +++ b/include/uapi/linux/vfio_ccw.h @@ -34,4 +34,14 @@ struct ccw_cmd_region { __u32 ret_code; } __packed; +/* + * Used for processing commands that read the subchannel-information block + * Reading this region triggers a stsch() to hardware + * Note: this is controlled by a capability + */ +struct ccw_schib_region { +#define SCHIB_AREA_SIZE 52 + __u8 schib_area[SCHIB_AREA_SIZE]; +} __packed; + #endif -- cgit v1.2.3 From 836e66c218f355ec01ba57671c85abf32961dcea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:32 +0200 Subject: bpf: Fix up bpf_skb_adjust_room helper's skb csum setting Lorenz recently reported: In our TC classifier cls_redirect [0], we use the following sequence of helper calls to decapsulate a GUE (basically IP + UDP + custom header) encapsulated packet: bpf_skb_adjust_room(skb, -encap_len, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_FIXED_GSO) bpf_redirect(skb->ifindex, BPF_F_INGRESS) It seems like some checksums of the inner headers are not validated in this case. For example, a TCP SYN packet with invalid TCP checksum is still accepted by the network stack and elicits a SYN ACK. [...] That is, we receive the following packet from the driver: | ETH | IP | UDP | GUE | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY ip_summed is CHECKSUM_UNNECESSARY because our NICs do rx checksum offloading. On this packet we run skb_adjust_room_mac(-encap_len), and get the following: | ETH | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY Note that ip_summed is still CHECKSUM_UNNECESSARY. After bpf_redirect()'ing into the ingress, we end up in tcp_v4_rcv(). There, skb_checksum_init() is turned into a no-op due to CHECKSUM_UNNECESSARY. The bpf_skb_adjust_room() helper is not aware of protocol specifics. Internally, it handles the CHECKSUM_COMPLETE case via skb_postpull_rcsum(), but that does not cover CHECKSUM_UNNECESSARY. In this case skb->csum_level of the original skb prior to bpf_skb_adjust_room() call was 0, that is, covering UDP. Right now there is no way to adjust the skb->csum_level. NICs that have checksum offload disabled (CHECKSUM_NONE) or that support CHECKSUM_COMPLETE are not affected. Use a safe default for CHECKSUM_UNNECESSARY by resetting to CHECKSUM_NONE and add a flag to the helper called BPF_F_ADJ_ROOM_NO_CSUM_RESET that allows users from opting out. Opting out is useful for the case where we don't remove/add full protocol headers, or for the case where a user wants to adjust the csum level manually e.g. through bpf_csum_level() helper that is added in subsequent patch. The bpf_skb_proto_{4_to_6,6_to_4}() for NAT64/46 translation from the BPF bpf_skb_change_proto() helper uses bpf_skb_net_hdr_{push,pop}() pair internally as well but doesn't change layers, only transitions between v4 to v6 and vice versa, therefore no adoption is required there. [0] https://lore.kernel.org/bpf/20200424185556.7358-1-lmb@cloudflare.com/ Fixes: 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") Reported-by: Lorenz Bauer Reported-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/CACAyw9-uU_52esMd1JjuA80fRPHJv5vsSg8GnfW3t_qDU4aVKQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/11a90472e7cce83e76ddbfce81fdfce7bfc68808.1591108731.git.daniel@iogearbox.net --- include/linux/skbuff.h | 8 ++++++++ include/uapi/linux/bpf.h | 8 ++++++++ net/core/filter.c | 8 ++++++-- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a0d5c2760103..0c0377fc00c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3919,6 +3919,14 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + skb->ip_summed = CHECKSUM_NONE; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ae82bcb03124..278dcc0af961 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3113,7 +3113,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3163,7 +3164,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3191,6 +3193,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { -- cgit v1.2.3 From 7cdec54f9713256bb170873a1fc5c75c9127c9d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:33 +0200 Subject: bpf: Add csum_level helper for fixing up csum levels Add a bpf_csum_level() helper which BPF programs can use in combination with bpf_skb_adjust_room() when they pass in BPF_F_ADJ_ROOM_NO_CSUM_RESET flag to the latter to avoid falling back to CHECKSUM_NONE. The bpf_csum_level() allows to adjust CHECKSUM_UNNECESSARY skb->csum_levels via BPF_CSUM_LEVEL_{INC,DEC} which calls __skb_{incr,decr}_checksum_unnecessary() on the skb. The helper also allows a BPF_CSUM_LEVEL_RESET which sets the skb's csum to CHECKSUM_NONE as well as a BPF_CSUM_LEVEL_QUERY to just return the current level. Without this helper, there is no way to otherwise adjust the skb->csum_level. I did not add an extra dummy flags as there is plenty of free bitspace in level argument itself iff ever needed in future. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/279ae3717cb3d03c0ffeb511493c93c450a01e1a.1591108731.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 38 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), diff --git a/net/core/filter.c b/net/core/filter.c index 278dcc0af961..d01a244b5087 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2015,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? + skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -6280,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6613,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), -- cgit v1.2.3 From 56f2e3b7d819f4fa44857ba81aa6870f18714ea0 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 2 Jun 2020 10:17:28 +0100 Subject: capabilities: add description for CAP_SETFCAP Document the purpose of CAP_SETFCAP. For some reason this capability had no description while the others did. Signed-off-by: Stefan Hajnoczi Signed-off-by: James Morris --- include/uapi/linux/capability.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 240fdb9a60f6..877972fdfda3 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -331,6 +331,8 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_CONTROL 30 +/* Set or remove capabilities on files */ + #define CAP_SETFCAP 31 /* Override MAC access. -- cgit v1.2.3 From d8cac29b1d52204e6632d2887eff766acd02b9aa Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Tue, 5 May 2020 14:27:43 +0200 Subject: vfio-ccw: Introduce a new CRW region This region provides a mechanism to pass a Channel Report Word that affect vfio-ccw devices, and needs to be passed to the guest for its awareness and/or processing. The base driver (see crw_collect_info()) provides space for two CRWs, as a subchannel event may have two CRWs chained together (one for the ssid, one for the subchannel). As vfio-ccw will deal with everything at the subchannel level, provide space for a single CRW to be transferred in one shot. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-7-farman@linux.ibm.com> [CH: added padding to ccw_crw_region] Signed-off-by: Cornelia Huck --- Documentation/s390/vfio-ccw.rst | 20 ++++++++++++++ drivers/s390/cio/vfio_ccw_chp.c | 55 +++++++++++++++++++++++++++++++++++++ drivers/s390/cio/vfio_ccw_drv.c | 20 ++++++++++++++ drivers/s390/cio/vfio_ccw_ops.c | 8 ++++++ drivers/s390/cio/vfio_ccw_private.h | 4 +++ include/uapi/linux/vfio.h | 2 ++ include/uapi/linux/vfio_ccw.h | 9 ++++++ 7 files changed, 118 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index 32310df525ba..8aad08a8b8a5 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -298,6 +298,26 @@ This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_SCHIB. Reading this region triggers a STORE SUBCHANNEL to be issued to the associated hardware. +vfio-ccw crw region +--------------------- + +The vfio-ccw crw region is used to return Channel Report Word (CRW) +data to userspace:: + + struct ccw_crw_region { + __u32 crw; + __u32 pad; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_CRW. + +Reading this region returns a CRW if one that is relevant for this +subchannel (e.g. one reporting changes in channel path state) is +pending, or all zeroes if not. If multiple CRWs are pending (including +possibly chained CRWs), reading this region again will return the next +one, until no more CRWs are pending and zeroes are returned. This is +similar to how STORE CHANNEL REPORT WORD works. + vfio-ccw operation details -------------------------- diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c index 18f3b3e873a9..37ea344a4d72 100644 --- a/drivers/s390/cio/vfio_ccw_chp.c +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -74,3 +74,58 @@ int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private) VFIO_REGION_INFO_FLAG_READ, private->schib_region); } + +static ssize_t vfio_ccw_crw_region_read(struct vfio_ccw_private *private, + char __user *buf, size_t count, + loff_t *ppos) +{ + unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; + loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; + struct ccw_crw_region *region; + int ret; + + if (pos + count > sizeof(*region)) + return -EINVAL; + + mutex_lock(&private->io_mutex); + region = private->region[i].data; + + if (copy_to_user(buf, (void *)region + pos, count)) + ret = -EFAULT; + else + ret = count; + + region->crw = 0; + + mutex_unlock(&private->io_mutex); + return ret; +} + +static ssize_t vfio_ccw_crw_region_write(struct vfio_ccw_private *private, + const char __user *buf, size_t count, + loff_t *ppos) +{ + return -EINVAL; +} + +static void vfio_ccw_crw_region_release(struct vfio_ccw_private *private, + struct vfio_ccw_region *region) +{ + +} + +const struct vfio_ccw_regops vfio_ccw_crw_region_ops = { + .read = vfio_ccw_crw_region_read, + .write = vfio_ccw_crw_region_write, + .release = vfio_ccw_crw_region_release, +}; + +int vfio_ccw_register_crw_dev_regions(struct vfio_ccw_private *private) +{ + return vfio_ccw_register_dev_region(private, + VFIO_REGION_SUBTYPE_CCW_CRW, + &vfio_ccw_crw_region_ops, + sizeof(struct ccw_crw_region), + VFIO_REGION_INFO_FLAG_READ, + private->crw_region); +} diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 7aeff42f370d..e4deae6fd525 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -28,6 +28,7 @@ struct workqueue_struct *vfio_ccw_work_q; static struct kmem_cache *vfio_ccw_io_region; static struct kmem_cache *vfio_ccw_cmd_region; static struct kmem_cache *vfio_ccw_schib_region; +static struct kmem_cache *vfio_ccw_crw_region; debug_info_t *vfio_ccw_debug_msg_id; debug_info_t *vfio_ccw_debug_trace_id; @@ -120,6 +121,8 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) static void vfio_ccw_free_regions(struct vfio_ccw_private *private) { + if (private->crw_region) + kmem_cache_free(vfio_ccw_crw_region, private->crw_region); if (private->schib_region) kmem_cache_free(vfio_ccw_schib_region, private->schib_region); if (private->cmd_region) @@ -165,6 +168,12 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (!private->schib_region) goto out_free; + private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, + GFP_KERNEL | GFP_DMA); + + if (!private->crw_region) + goto out_free; + private->sch = sch; dev_set_drvdata(&sch->dev, private); mutex_init(&private->io_mutex); @@ -366,6 +375,7 @@ static void vfio_ccw_debug_exit(void) static void vfio_ccw_destroy_regions(void) { + kmem_cache_destroy(vfio_ccw_crw_region); kmem_cache_destroy(vfio_ccw_schib_region); kmem_cache_destroy(vfio_ccw_cmd_region); kmem_cache_destroy(vfio_ccw_io_region); @@ -413,6 +423,16 @@ static int __init vfio_ccw_sch_init(void) goto out_err; } + vfio_ccw_crw_region = kmem_cache_create_usercopy("vfio_ccw_crw_region", + sizeof(struct ccw_crw_region), 0, + SLAB_ACCOUNT, 0, + sizeof(struct ccw_crw_region), NULL); + + if (!vfio_ccw_crw_region) { + ret = -ENOMEM; + goto out_err; + } + isc_register(VFIO_CCW_ISC); ret = css_driver_register(&vfio_ccw_sch_driver); if (ret) { diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index c3a74ab7bb86..8b3ed5b45277 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -178,6 +178,10 @@ static int vfio_ccw_mdev_open(struct mdev_device *mdev) if (ret) goto out_unregister; + ret = vfio_ccw_register_crw_dev_regions(private); + if (ret) + goto out_unregister; + return ret; out_unregister: @@ -389,6 +393,7 @@ static int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info) { switch (info->index) { case VFIO_CCW_IO_IRQ_INDEX: + case VFIO_CCW_CRW_IRQ_INDEX: info->count = 1; info->flags = VFIO_IRQ_INFO_EVENTFD; break; @@ -416,6 +421,9 @@ static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev, case VFIO_CCW_IO_IRQ_INDEX: ctx = &private->io_trigger; break; + case VFIO_CCW_CRW_IRQ_INDEX: + ctx = &private->crw_trigger; + break; default: return -EINVAL; } diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index d6601a8adf13..97131b4df0b9 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -57,6 +57,7 @@ void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private); +int vfio_ccw_register_crw_dev_regions(struct vfio_ccw_private *private); /** * struct vfio_ccw_private @@ -71,6 +72,7 @@ int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private); * @region: additional regions for other subchannel operations * @cmd_region: MMIO region for asynchronous I/O commands other than START * @schib_region: MMIO region for SCHIB information + * @crw_region: MMIO region for getting channel report words * @num_regions: number of additional regions * @cp: channel program for the current I/O operation * @irb: irb info received from interrupt @@ -90,6 +92,7 @@ struct vfio_ccw_private { struct vfio_ccw_region *region; struct ccw_cmd_region *cmd_region; struct ccw_schib_region *schib_region; + struct ccw_crw_region *crw_region; int num_regions; struct channel_program cp; @@ -97,6 +100,7 @@ struct vfio_ccw_private { union scsw scsw; struct eventfd_ctx *io_trigger; + struct eventfd_ctx *crw_trigger; struct work_struct io_work; } __aligned(8); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 7a1abbd889bd..907758cf6d60 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -379,6 +379,7 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) #define VFIO_REGION_SUBTYPE_CCW_SCHIB (2) +#define VFIO_REGION_SUBTYPE_CCW_CRW (3) /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped @@ -578,6 +579,7 @@ enum { enum { VFIO_CCW_IO_IRQ_INDEX, + VFIO_CCW_CRW_IRQ_INDEX, VFIO_CCW_NUM_IRQS }; diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h index 758bf214898d..aa04f3aa6db0 100644 --- a/include/uapi/linux/vfio_ccw.h +++ b/include/uapi/linux/vfio_ccw.h @@ -44,4 +44,13 @@ struct ccw_schib_region { __u8 schib_area[SCHIB_AREA_SIZE]; } __packed; +/* + * Used for returning a Channel Report Word to userspace. + * Note: this is controlled by a capability + */ +struct ccw_crw_region { + __u32 crw; + __u32 pad; +} __packed; + #endif -- cgit v1.2.3 From 10c5db286452b8c60e8f58e9a4c1cbc5a91e4e5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 23 May 2020 09:30:11 +0200 Subject: fs: move the fiemap definitions out of fs.h No need to pull the fiemap definitions into almost every file in the kernel build. Signed-off-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20200523073016.2944131-5-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/bad_inode.c | 1 + fs/btrfs/extent_io.h | 1 + fs/cifs/inode.c | 1 + fs/cifs/smb2ops.c | 1 + fs/ext2/inode.c | 1 + fs/ext4/ext4.h | 1 + fs/f2fs/data.c | 1 + fs/f2fs/inline.c | 1 + fs/gfs2/inode.c | 1 + fs/hpfs/file.c | 1 + fs/ioctl.c | 1 + fs/iomap/fiemap.c | 1 + fs/nilfs2/inode.c | 1 + fs/overlayfs/inode.c | 1 + fs/xfs/xfs_iops.c | 1 + include/linux/fiemap.h | 24 ++++++++++++++++++++++++ include/linux/fs.h | 19 +------------------ include/uapi/linux/fiemap.h | 6 +++--- 18 files changed, 43 insertions(+), 21 deletions(-) create mode 100644 include/linux/fiemap.h (limited to 'include/uapi') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 8035d2a44561..54f0ce444272 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -15,6 +15,7 @@ #include #include #include +#include static int bad_file_open(struct inode *inode, struct file *filp) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2ed65bd0760e..817698bc0669 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -5,6 +5,7 @@ #include #include +#include #include "ulist.h" /* diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 390d2b15ef6e..3f276eb8ca68 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "cifsfs.h" diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f829f4165d38..09047f1ddfb6 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..0f12a0e8a8d9 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1eb07ca91fca..9e5c332a2b94 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include #endif diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdf2f626bea7..25abbbb65ba0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4167e5408151..9686ffea177e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -8,6 +8,7 @@ #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 70b2d3a1e866..4842f313a808 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "gfs2.h" diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..62959a8e43ad 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include +#include #define BLOCKS(size) (((size) + 511) >> 9) diff --git a/fs/ioctl.c b/fs/ioctl.c index 8fe5131b1dee..3f300cc07dee 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index d55e8f491a5e..0a807bbb2b4a 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -6,6 +6,7 @@ #include #include #include +#include struct fiemap_ctx { struct fiemap_extent_info *fi; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..6e1aca38931f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b0d42ece4d7c..b5fec3410556 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "overlayfs.h" diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f7a99b3bbcf7..44c353998ac5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * Directories have different lock order w.r.t. mmap_sem compared to regular diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h new file mode 100644 index 000000000000..240d4f7d9116 --- /dev/null +++ b/include/linux/fiemap.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FIEMAP_H +#define _LINUX_FIEMAP_H 1 + +#include +#include + +struct fiemap_extent_info { + unsigned int fi_flags; /* Flags as passed from user */ + unsigned int fi_extents_mapped; /* Number of mapped extents */ + unsigned int fi_extents_max; /* Size of fiemap_extent array */ + struct fiemap_extent __user *fi_extents_start; /* Start of + fiemap_extent array */ +}; + +int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, + u64 phys, u64 len, u32 flags); +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, u64 len, + get_block_t *get_block); + +#endif /* _LINUX_FIEMAP_H 1 */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3104c6f7527b..09bcd329c062 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ struct backing_dev_info; struct bdi_writeback; struct bio; struct export_operations; +struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; @@ -1745,19 +1745,6 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); -/* - * VFS FS_IOC_FIEMAP helper definitions. - */ -struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ -}; -int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, - u64 phys, u64 len, u32 flags); -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); /* * This is the "filldir" function type, used by readdir() to let @@ -3299,10 +3286,6 @@ static inline int vfs_fstat(int fd, struct kstat *stat) extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -extern int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block); - extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 7a900b2377b6..24ca0c00cae3 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -9,8 +9,8 @@ * Andreas Dilger */ -#ifndef _LINUX_FIEMAP_H -#define _LINUX_FIEMAP_H +#ifndef _UAPI_LINUX_FIEMAP_H +#define _UAPI_LINUX_FIEMAP_H #include @@ -67,4 +67,4 @@ struct fiemap { #define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other * files. */ -#endif /* _LINUX_FIEMAP_H */ +#endif /* _UAPI_LINUX_FIEMAP_H */ -- cgit v1.2.3 From 5f1f79bbc9e26fa9412fa9522f957bb8f030c442 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 7 May 2020 16:01:25 +0200 Subject: virtio-mem: Paravirtualized memory hotplug Each virtio-mem device owns exactly one memory region. It is responsible for adding/removing memory from that memory region on request. When the device driver starts up, the requested amount of memory is queried and then plugged to Linux. On request, further memory can be plugged or unplugged. This patch only implements the plugging part. On x86-64, memory can currently be plugged in 4MB ("subblock") granularity. When required, a new memory block will be added (e.g., usually 128MB on x86-64) in order to plug more subblocks. Only x86-64 was tested for now. The online_page callback is used to keep unplugged subblocks offline when onlining memory - similar to the Hyper-V balloon driver. Unplugged pages are marked PG_offline, to tell dump tools (e.g., makedumpfile) to skip them. User space is usually responsible for onlining the added memory. The memory hotplug notifier is used to synchronize virtio-mem activity against memory onlining/offlining. Each virtio-mem device can belong to a NUMA node, which allows us to easily add/remove small chunks of memory to/from a specific NUMA node by using multiple virtio-mem devices. Something that works even when the guest has no idea about the NUMA topology. One way to view virtio-mem is as a "resizable DIMM" or a DIMM with many "sub-DIMMS". This patch directly introduces the basic infrastructure to implement memory unplug. Especially the memory block states and subblock bitmaps will be heavily used there. Notes: - In case memory is to be onlined by user space, we limit the amount of offline memory blocks, to not run out of memory. This is esp. an issue if memory is added faster than it is getting onlined. - Suspend/Hibernate is not supported due to the way virtio-mem devices behave. Limited support might be possible in the future. - Reloading the device driver is not supported. Reviewed-by: Pankaj Gupta Tested-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Oscar Salvador Cc: Michal Hocko Cc: Igor Mammedov Cc: Dave Young Cc: Andrew Morton Cc: Dan Williams Cc: Pavel Tatashin Cc: Stefan Hajnoczi Cc: Vlastimil Babka Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: linux-acpi@vger.kernel.org Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20200507140139.17083-2-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/Kconfig | 16 + drivers/virtio/Makefile | 1 + drivers/virtio/virtio_mem.c | 1533 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_mem.h | 200 +++++ 5 files changed, 1751 insertions(+) create mode 100644 drivers/virtio/virtio_mem.c create mode 100644 include/uapi/linux/virtio_mem.h (limited to 'include/uapi') diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 69a32dfc318a..d6dde7d2cf76 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -78,6 +78,22 @@ config VIRTIO_BALLOON If unsure, say M. +config VIRTIO_MEM + tristate "Virtio mem driver" + default m + depends on X86_64 + depends on VIRTIO + depends on MEMORY_HOTPLUG_SPARSE + depends on MEMORY_HOTREMOVE + help + This driver provides access to virtio-mem paravirtualized memory + devices, allowing to hotplug and hotunplug memory. + + This driver was only tested under x86-64, but should theoretically + work on all architectures that support memory hotplug and hotremove. + + If unsure, say M. + config VIRTIO_INPUT tristate "Virtio input driver" depends on VIRTIO diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index 29a1386ecc03..4d993791f2d7 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -7,3 +7,4 @@ virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o +obj-$(CONFIG_VIRTIO_MEM) += virtio_mem.o diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c new file mode 100644 index 000000000000..5d1dcaa6fc42 --- /dev/null +++ b/drivers/virtio/virtio_mem.c @@ -0,0 +1,1533 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio-mem device driver. + * + * Copyright Red Hat, Inc. 2020 + * + * Author(s): David Hildenbrand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum virtio_mem_mb_state { + /* Unplugged, not added to Linux. Can be reused later. */ + VIRTIO_MEM_MB_STATE_UNUSED = 0, + /* (Partially) plugged, not added to Linux. Error on add_memory(). */ + VIRTIO_MEM_MB_STATE_PLUGGED, + /* Fully plugged, fully added to Linux, offline. */ + VIRTIO_MEM_MB_STATE_OFFLINE, + /* Partially plugged, fully added to Linux, offline. */ + VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL, + /* Fully plugged, fully added to Linux, online (!ZONE_MOVABLE). */ + VIRTIO_MEM_MB_STATE_ONLINE, + /* Partially plugged, fully added to Linux, online (!ZONE_MOVABLE). */ + VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL, + /* + * Fully plugged, fully added to Linux, online (ZONE_MOVABLE). + * We are not allowed to allocate (unplug) parts of this block that + * are not movable (similar to gigantic pages). We will never allow + * to online OFFLINE_PARTIAL to ZONE_MOVABLE (as they would contain + * unmovable parts). + */ + VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE, + VIRTIO_MEM_MB_STATE_COUNT +}; + +struct virtio_mem { + struct virtio_device *vdev; + + /* We might first have to unplug all memory when starting up. */ + bool unplug_all_required; + + /* Workqueue that processes the plug/unplug requests. */ + struct work_struct wq; + atomic_t config_changed; + + /* Virtqueue for guest->host requests. */ + struct virtqueue *vq; + + /* Wait for a host response to a guest request. */ + wait_queue_head_t host_resp; + + /* Space for one guest request and the host response. */ + struct virtio_mem_req req; + struct virtio_mem_resp resp; + + /* The current size of the device. */ + uint64_t plugged_size; + /* The requested size of the device. */ + uint64_t requested_size; + + /* The device block size (for communicating with the device). */ + uint32_t device_block_size; + /* Physical start address of the memory region. */ + uint64_t addr; + /* Maximum region size in bytes. */ + uint64_t region_size; + + /* The subblock size. */ + uint32_t subblock_size; + /* The number of subblocks per memory block. */ + uint32_t nb_sb_per_mb; + + /* Id of the first memory block of this device. */ + unsigned long first_mb_id; + /* Id of the last memory block of this device. */ + unsigned long last_mb_id; + /* Id of the last usable memory block of this device. */ + unsigned long last_usable_mb_id; + /* Id of the next memory bock to prepare when needed. */ + unsigned long next_mb_id; + + /* Summary of all memory block states. */ + unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT]; +#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD 10 + + /* + * One byte state per memory block. + * + * Allocated via vmalloc(). When preparing new blocks, resized + * (alloc+copy+free) when needed (crossing pages with the next mb). + * (when crossing pages). + * + * With 128MB memory blocks, we have states for 512GB of memory in one + * page. + */ + uint8_t *mb_state; + + /* + * $nb_sb_per_mb bit per memory block. Handled similar to mb_state. + * + * With 4MB subblocks, we manage 128GB of memory in one page. + */ + unsigned long *sb_bitmap; + + /* + * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap. + * + * When this lock is held the pointers can't change, ONLINE and + * OFFLINE blocks can't change the state and no subblocks will get + * plugged. + */ + struct mutex hotplug_mutex; + bool hotplug_active; + + /* An error occurred we cannot handle - stop processing requests. */ + bool broken; + + /* The driver is being removed. */ + spinlock_t removal_lock; + bool removing; + + /* Timer for retrying to plug/unplug memory. */ + struct hrtimer retry_timer; +#define VIRTIO_MEM_RETRY_TIMER_MS 30000 + + /* Memory notifier (online/offline events). */ + struct notifier_block memory_notifier; + + /* Next device in the list of virtio-mem devices. */ + struct list_head next; +}; + +/* + * We have to share a single online_page callback among all virtio-mem + * devices. We use RCU to iterate the list in the callback. + */ +static DEFINE_MUTEX(virtio_mem_mutex); +static LIST_HEAD(virtio_mem_devices); + +static void virtio_mem_online_page_cb(struct page *page, unsigned int order); + +/* + * Register a virtio-mem device so it will be considered for the online_page + * callback. + */ +static int register_virtio_mem_device(struct virtio_mem *vm) +{ + int rc = 0; + + /* First device registers the callback. */ + mutex_lock(&virtio_mem_mutex); + if (list_empty(&virtio_mem_devices)) + rc = set_online_page_callback(&virtio_mem_online_page_cb); + if (!rc) + list_add_rcu(&vm->next, &virtio_mem_devices); + mutex_unlock(&virtio_mem_mutex); + + return rc; +} + +/* + * Unregister a virtio-mem device so it will no longer be considered for the + * online_page callback. + */ +static void unregister_virtio_mem_device(struct virtio_mem *vm) +{ + /* Last device unregisters the callback. */ + mutex_lock(&virtio_mem_mutex); + list_del_rcu(&vm->next); + if (list_empty(&virtio_mem_devices)) + restore_online_page_callback(&virtio_mem_online_page_cb); + mutex_unlock(&virtio_mem_mutex); + + synchronize_rcu(); +} + +/* + * Calculate the memory block id of a given address. + */ +static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) +{ + return addr / memory_block_size_bytes(); +} + +/* + * Calculate the physical start address of a given memory block id. + */ +static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) +{ + return mb_id * memory_block_size_bytes(); +} + +/* + * Calculate the subblock id of a given address. + */ +static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, + unsigned long addr) +{ + const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); + const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); + + return (addr - mb_addr) / vm->subblock_size; +} + +/* + * Set the state of a memory block, taking care of the state counter. + */ +static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id, + enum virtio_mem_mb_state state) +{ + const unsigned long idx = mb_id - vm->first_mb_id; + enum virtio_mem_mb_state old_state; + + old_state = vm->mb_state[idx]; + vm->mb_state[idx] = state; + + BUG_ON(vm->nb_mb_state[old_state] == 0); + vm->nb_mb_state[old_state]--; + vm->nb_mb_state[state]++; +} + +/* + * Get the state of a memory block. + */ +static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm, + unsigned long mb_id) +{ + const unsigned long idx = mb_id - vm->first_mb_id; + + return vm->mb_state[idx]; +} + +/* + * Prepare the state array for the next memory block. + */ +static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm) +{ + unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1; + unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2; + int old_pages = PFN_UP(old_bytes); + int new_pages = PFN_UP(new_bytes); + uint8_t *new_mb_state; + + if (vm->mb_state && old_pages == new_pages) + return 0; + + new_mb_state = vzalloc(new_pages * PAGE_SIZE); + if (!new_mb_state) + return -ENOMEM; + + mutex_lock(&vm->hotplug_mutex); + if (vm->mb_state) + memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE); + vfree(vm->mb_state); + vm->mb_state = new_mb_state; + mutex_unlock(&vm->hotplug_mutex); + + return 0; +} + +#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \ + for (_mb_id = _vm->first_mb_id; \ + _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \ + _mb_id++) \ + if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) + +/* + * Mark all selected subblocks plugged. + * + * Will not modify the state of the memory block. + */ +static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) +{ + const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + + __bitmap_set(vm->sb_bitmap, bit, count); +} + +/* + * Mark all selected subblocks unplugged. + * + * Will not modify the state of the memory block. + */ +static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) +{ + const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + + __bitmap_clear(vm->sb_bitmap, bit, count); +} + +/* + * Test if all selected subblocks are plugged. + */ +static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) +{ + const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + + if (count == 1) + return test_bit(bit, vm->sb_bitmap); + + /* TODO: Helper similar to bitmap_set() */ + return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >= + bit + count; +} + +/* + * Find the first plugged subblock. Returns vm->nb_sb_per_mb in case there is + * none. + */ +static int virtio_mem_mb_first_plugged_sb(struct virtio_mem *vm, + unsigned long mb_id) +{ + const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb; + + return find_next_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) - bit; +} + +/* + * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is + * none. + */ +static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm, + unsigned long mb_id) +{ + const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb; + + return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) - + bit; +} + +/* + * Prepare the subblock bitmap for the next memory block. + */ +static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) +{ + const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id; + const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb; + const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb; + int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); + int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); + unsigned long *new_sb_bitmap, *old_sb_bitmap; + + if (vm->sb_bitmap && old_pages == new_pages) + return 0; + + new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE); + if (!new_sb_bitmap) + return -ENOMEM; + + mutex_lock(&vm->hotplug_mutex); + if (new_sb_bitmap) + memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE); + + old_sb_bitmap = vm->sb_bitmap; + vm->sb_bitmap = new_sb_bitmap; + mutex_unlock(&vm->hotplug_mutex); + + vfree(old_sb_bitmap); + return 0; +} + +/* + * Try to add a memory block to Linux. This will usually only fail + * if out of memory. + * + * Must not be called with the vm->hotplug_mutex held (possible deadlock with + * onlining code). + * + * Will not modify the state of the memory block. + */ +static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + int nid = memory_add_physaddr_to_nid(addr); + + dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); + return add_memory(nid, addr, memory_block_size_bytes()); +} + +/* + * Try to remove a memory block from Linux. Will only fail if the memory block + * is not offline. + * + * Must not be called with the vm->hotplug_mutex held (possible deadlock with + * onlining code). + * + * Will not modify the state of the memory block. + */ +static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + int nid = memory_add_physaddr_to_nid(addr); + + dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); + return remove_memory(nid, addr, memory_block_size_bytes()); +} + +/* + * Trigger the workqueue so the device can perform its magic. + */ +static void virtio_mem_retry(struct virtio_mem *vm) +{ + unsigned long flags; + + spin_lock_irqsave(&vm->removal_lock, flags); + if (!vm->removing) + queue_work(system_freezable_wq, &vm->wq); + spin_unlock_irqrestore(&vm->removal_lock, flags); +} + +/* + * Test if a virtio-mem device overlaps with the given range. Can be called + * from (notifier) callbacks lockless. + */ +static bool virtio_mem_overlaps_range(struct virtio_mem *vm, + unsigned long start, unsigned long size) +{ + unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id); + unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) + + memory_block_size_bytes(); + + return start < dev_end && dev_start < start + size; +} + +/* + * Test if a virtio-mem device owns a memory block. Can be called from + * (notifier) callbacks lockless. + */ +static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id) +{ + return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id; +} + +static int virtio_mem_notify_going_online(struct virtio_mem *vm, + unsigned long mb_id, + enum zone_type zone) +{ + switch (virtio_mem_mb_get_state(vm, mb_id)) { + case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: + /* + * We won't allow to online a partially plugged memory block + * to the MOVABLE zone - it would contain unmovable parts. + */ + if (zone == ZONE_MOVABLE) { + dev_warn_ratelimited(&vm->vdev->dev, + "memory block has holes, MOVABLE not supported\n"); + return NOTIFY_BAD; + } + return NOTIFY_OK; + case VIRTIO_MEM_MB_STATE_OFFLINE: + return NOTIFY_OK; + default: + break; + } + dev_warn_ratelimited(&vm->vdev->dev, + "memory block onlining denied\n"); + return NOTIFY_BAD; +} + +static void virtio_mem_notify_offline(struct virtio_mem *vm, + unsigned long mb_id) +{ + switch (virtio_mem_mb_get_state(vm, mb_id)) { + case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL: + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + break; + case VIRTIO_MEM_MB_STATE_ONLINE: + case VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE: + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_OFFLINE); + break; + default: + BUG(); + break; + } +} + +static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id, + enum zone_type zone) +{ + unsigned long nb_offline; + + switch (virtio_mem_mb_get_state(vm, mb_id)) { + case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: + BUG_ON(zone == ZONE_MOVABLE); + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); + break; + case VIRTIO_MEM_MB_STATE_OFFLINE: + if (zone == ZONE_MOVABLE) + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE); + else + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_ONLINE); + break; + default: + BUG(); + break; + } + nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; + + /* see if we can add new blocks now that we onlined one block */ + if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1) + virtio_mem_retry(vm); +} + +/* + * This callback will either be called synchronously from add_memory() or + * asynchronously (e.g., triggered via user space). We have to be careful + * with locking when calling add_memory(). + */ +static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, + unsigned long action, void *arg) +{ + struct virtio_mem *vm = container_of(nb, struct virtio_mem, + memory_notifier); + struct memory_notify *mhp = arg; + const unsigned long start = PFN_PHYS(mhp->start_pfn); + const unsigned long size = PFN_PHYS(mhp->nr_pages); + const unsigned long mb_id = virtio_mem_phys_to_mb_id(start); + enum zone_type zone; + int rc = NOTIFY_OK; + + if (!virtio_mem_overlaps_range(vm, start, size)) + return NOTIFY_DONE; + + /* + * Memory is onlined/offlined in memory block granularity. We cannot + * cross virtio-mem device boundaries and memory block boundaries. Bail + * out if this ever changes. + */ + if (WARN_ON_ONCE(size != memory_block_size_bytes() || + !IS_ALIGNED(start, memory_block_size_bytes()))) + return NOTIFY_BAD; + + /* + * Avoid circular locking lockdep warnings. We lock the mutex + * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The + * blocking_notifier_call_chain() has it's own lock, which gets unlocked + * between both notifier calls and will bail out. False positive. + */ + lockdep_off(); + + switch (action) { + case MEM_GOING_OFFLINE: + mutex_lock(&vm->hotplug_mutex); + if (vm->removing) { + rc = notifier_from_errno(-EBUSY); + mutex_unlock(&vm->hotplug_mutex); + break; + } + vm->hotplug_active = true; + break; + case MEM_GOING_ONLINE: + mutex_lock(&vm->hotplug_mutex); + if (vm->removing) { + rc = notifier_from_errno(-EBUSY); + mutex_unlock(&vm->hotplug_mutex); + break; + } + vm->hotplug_active = true; + zone = page_zonenum(pfn_to_page(mhp->start_pfn)); + rc = virtio_mem_notify_going_online(vm, mb_id, zone); + break; + case MEM_OFFLINE: + virtio_mem_notify_offline(vm, mb_id); + vm->hotplug_active = false; + mutex_unlock(&vm->hotplug_mutex); + break; + case MEM_ONLINE: + zone = page_zonenum(pfn_to_page(mhp->start_pfn)); + virtio_mem_notify_online(vm, mb_id, zone); + vm->hotplug_active = false; + mutex_unlock(&vm->hotplug_mutex); + break; + case MEM_CANCEL_OFFLINE: + case MEM_CANCEL_ONLINE: + if (!vm->hotplug_active) + break; + vm->hotplug_active = false; + mutex_unlock(&vm->hotplug_mutex); + break; + default: + break; + } + + lockdep_on(); + + return rc; +} + +/* + * Set a range of pages PG_offline. + */ +static void virtio_mem_set_fake_offline(unsigned long pfn, + unsigned int nr_pages) +{ + for (; nr_pages--; pfn++) + __SetPageOffline(pfn_to_page(pfn)); +} + +/* + * Clear PG_offline from a range of pages. + */ +static void virtio_mem_clear_fake_offline(unsigned long pfn, + unsigned int nr_pages) +{ + for (; nr_pages--; pfn++) + __ClearPageOffline(pfn_to_page(pfn)); +} + +/* + * Release a range of fake-offline pages to the buddy, effectively + * fake-onlining them. + */ +static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages) +{ + const int order = MAX_ORDER - 1; + int i; + + /* + * We are always called with subblock granularity, which is at least + * aligned to MAX_ORDER - 1. + */ + virtio_mem_clear_fake_offline(pfn, nr_pages); + + for (i = 0; i < nr_pages; i += 1 << order) + generic_online_page(pfn_to_page(pfn + i), order); +} + +static void virtio_mem_online_page_cb(struct page *page, unsigned int order) +{ + const unsigned long addr = page_to_phys(page); + const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); + struct virtio_mem *vm; + int sb_id; + + /* + * We exploit here that subblocks have at least MAX_ORDER - 1 + * size/alignment and that this callback is is called with such a + * size/alignment. So we cannot cross subblocks and therefore + * also not memory blocks. + */ + rcu_read_lock(); + list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { + if (!virtio_mem_owned_mb(vm, mb_id)) + continue; + + sb_id = virtio_mem_phys_to_sb_id(vm, addr); + /* + * If plugged, online the pages, otherwise, set them fake + * offline (PageOffline). + */ + if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + generic_online_page(page, order); + else + virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order); + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + /* not virtio-mem memory, but e.g., a DIMM. online it */ + generic_online_page(page, order); +} + +static uint64_t virtio_mem_send_request(struct virtio_mem *vm, + const struct virtio_mem_req *req) +{ + struct scatterlist *sgs[2], sg_req, sg_resp; + unsigned int len; + int rc; + + /* don't use the request residing on the stack (vaddr) */ + vm->req = *req; + + /* out: buffer for request */ + sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); + sgs[0] = &sg_req; + + /* in: buffer for response */ + sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); + sgs[1] = &sg_resp; + + rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); + if (rc < 0) + return rc; + + virtqueue_kick(vm->vq); + + /* wait for a response */ + wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); + + return virtio16_to_cpu(vm->vdev, vm->resp.type); +} + +static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + const uint64_t nb_vm_blocks = size / vm->device_block_size; + const struct virtio_mem_req req = { + .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), + .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), + .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), + }; + + if (atomic_read(&vm->config_changed)) + return -EAGAIN; + + switch (virtio_mem_send_request(vm, &req)) { + case VIRTIO_MEM_RESP_ACK: + vm->plugged_size += size; + return 0; + case VIRTIO_MEM_RESP_NACK: + return -EAGAIN; + case VIRTIO_MEM_RESP_BUSY: + return -EBUSY; + case VIRTIO_MEM_RESP_ERROR: + return -EINVAL; + default: + return -ENOMEM; + } +} + +static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, + uint64_t size) +{ + const uint64_t nb_vm_blocks = size / vm->device_block_size; + const struct virtio_mem_req req = { + .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), + .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), + .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), + }; + + if (atomic_read(&vm->config_changed)) + return -EAGAIN; + + switch (virtio_mem_send_request(vm, &req)) { + case VIRTIO_MEM_RESP_ACK: + vm->plugged_size -= size; + return 0; + case VIRTIO_MEM_RESP_BUSY: + return -EBUSY; + case VIRTIO_MEM_RESP_ERROR: + return -EINVAL; + default: + return -ENOMEM; + } +} + +static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) +{ + const struct virtio_mem_req req = { + .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), + }; + + switch (virtio_mem_send_request(vm, &req)) { + case VIRTIO_MEM_RESP_ACK: + vm->unplug_all_required = false; + vm->plugged_size = 0; + /* usable region might have shrunk */ + atomic_set(&vm->config_changed, 1); + return 0; + case VIRTIO_MEM_RESP_BUSY: + return -EBUSY; + default: + return -ENOMEM; + } +} + +/* + * Plug selected subblocks. Updates the plugged state, but not the state + * of the memory block. + */ +static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, + int sb_id, int count) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->subblock_size; + const uint64_t size = count * vm->subblock_size; + int rc; + + dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id, + sb_id, sb_id + count - 1); + + rc = virtio_mem_send_plug_request(vm, addr, size); + if (!rc) + virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count); + return rc; +} + +/* + * Unplug selected subblocks. Updates the plugged state, but not the state + * of the memory block. + */ +static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, + int sb_id, int count) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->subblock_size; + const uint64_t size = count * vm->subblock_size; + int rc; + + dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n", + mb_id, sb_id, sb_id + count - 1); + + rc = virtio_mem_send_unplug_request(vm, addr, size); + if (!rc) + virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count); + return rc; +} + +/* + * Unplug the desired number of plugged subblocks of a offline or not-added + * memory block. Will fail if any subblock cannot get unplugged (instead of + * skipping it). + * + * Will not modify the state of the memory block. + * + * Note: can fail after some subblocks were unplugged. + */ +static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) +{ + int sb_id, count; + int rc; + + while (*nb_sb) { + sb_id = virtio_mem_mb_first_plugged_sb(vm, mb_id); + if (sb_id >= vm->nb_sb_per_mb) + break; + count = 1; + while (count < *nb_sb && + sb_id + count < vm->nb_sb_per_mb && + virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count, + 1)) + count++; + + rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count); + if (rc) + return rc; + *nb_sb -= count; + } + + return 0; +} + +/* + * Unplug all plugged subblocks of an offline or not-added memory block. + * + * Will not modify the state of the memory block. + * + * Note: can fail after some subblocks were unplugged. + */ +static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id) +{ + uint64_t nb_sb = vm->nb_sb_per_mb; + + return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb); +} + +/* + * Prepare tracking data for the next memory block. + */ +static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, + unsigned long *mb_id) +{ + int rc; + + if (vm->next_mb_id > vm->last_usable_mb_id) + return -ENOSPC; + + /* Resize the state array if required. */ + rc = virtio_mem_mb_state_prepare_next_mb(vm); + if (rc) + return rc; + + /* Resize the subblock bitmap if required. */ + rc = virtio_mem_sb_bitmap_prepare_next_mb(vm); + if (rc) + return rc; + + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++; + *mb_id = vm->next_mb_id++; + return 0; +} + +/* + * Don't add too many blocks that are not onlined yet to avoid running OOM. + */ +static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm) +{ + unsigned long nb_offline; + + nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; + return nb_offline >= VIRTIO_MEM_NB_OFFLINE_THRESHOLD; +} + +/* + * Try to plug the desired number of subblocks and add the memory block + * to Linux. + * + * Will modify the state of the memory block. + */ +static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) +{ + const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb); + int rc, rc2; + + if (WARN_ON_ONCE(!count)) + return -EINVAL; + + /* + * Plug the requested number of subblocks before adding it to linux, + * so that onlining will directly online all plugged subblocks. + */ + rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count); + if (rc) + return rc; + + /* + * Mark the block properly offline before adding it to Linux, + * so the memory notifiers will find the block in the right state. + */ + if (count == vm->nb_sb_per_mb) + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_OFFLINE); + else + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + + /* Add the memory block to linux - if that fails, try to unplug. */ + rc = virtio_mem_mb_add(vm, mb_id); + if (rc) { + enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED; + + dev_err(&vm->vdev->dev, + "adding memory block %lu failed with %d\n", mb_id, rc); + rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count); + + /* + * TODO: Linux MM does not properly clean up yet in all cases + * where adding of memory failed - especially on -ENOMEM. + */ + if (rc2) + new_state = VIRTIO_MEM_MB_STATE_PLUGGED; + virtio_mem_mb_set_state(vm, mb_id, new_state); + return rc; + } + + *nb_sb -= count; + return 0; +} + +/* + * Try to plug the desired number of subblocks of a memory block that + * is already added to Linux. + * + * Will modify the state of the memory block. + * + * Note: Can fail after some subblocks were successfully plugged. + */ +static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, + uint64_t *nb_sb, bool online) +{ + unsigned long pfn, nr_pages; + int sb_id, count; + int rc; + + if (WARN_ON_ONCE(!*nb_sb)) + return -EINVAL; + + while (*nb_sb) { + sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id); + if (sb_id >= vm->nb_sb_per_mb) + break; + count = 1; + while (count < *nb_sb && + sb_id + count < vm->nb_sb_per_mb && + !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count, + 1)) + count++; + + rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count); + if (rc) + return rc; + *nb_sb -= count; + if (!online) + continue; + + /* fake-online the pages if the memory block is online */ + pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + + sb_id * vm->subblock_size); + nr_pages = PFN_DOWN(count * vm->subblock_size); + virtio_mem_fake_online(pfn, nr_pages); + } + + if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (online) + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_ONLINE); + else + virtio_mem_mb_set_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_OFFLINE); + } + + return rc; +} + +/* + * Try to plug the requested amount of memory. + */ +static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) +{ + uint64_t nb_sb = diff / vm->subblock_size; + unsigned long mb_id; + int rc; + + if (!nb_sb) + return 0; + + /* Don't race with onlining/offlining */ + mutex_lock(&vm->hotplug_mutex); + + /* Try to plug subblocks of partially plugged online blocks. */ + virtio_mem_for_each_mb_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) { + rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true); + if (rc || !nb_sb) + goto out_unlock; + cond_resched(); + } + + /* Try to plug subblocks of partially plugged offline blocks. */ + virtio_mem_for_each_mb_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { + rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false); + if (rc || !nb_sb) + goto out_unlock; + cond_resched(); + } + + /* + * We won't be working on online/offline memory blocks from this point, + * so we can't race with memory onlining/offlining. Drop the mutex. + */ + mutex_unlock(&vm->hotplug_mutex); + + /* Try to plug and add unused blocks */ + virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) { + if (virtio_mem_too_many_mb_offline(vm)) + return -ENOSPC; + + rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); + if (rc || !nb_sb) + return rc; + cond_resched(); + } + + /* Try to prepare, plug and add new blocks */ + while (nb_sb) { + if (virtio_mem_too_many_mb_offline(vm)) + return -ENOSPC; + + rc = virtio_mem_prepare_next_mb(vm, &mb_id); + if (rc) + return rc; + rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); + if (rc) + return rc; + cond_resched(); + } + + return 0; +out_unlock: + mutex_unlock(&vm->hotplug_mutex); + return rc; +} + +/* + * Try to unplug all blocks that couldn't be unplugged before, for example, + * because the hypervisor was busy. + */ +static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) +{ + unsigned long mb_id; + int rc; + + virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) { + rc = virtio_mem_mb_unplug(vm, mb_id); + if (rc) + return rc; + virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED); + } + + return 0; +} + +/* + * Update all parts of the config that could have changed. + */ +static void virtio_mem_refresh_config(struct virtio_mem *vm) +{ + const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + uint64_t new_plugged_size, usable_region_size, end_addr; + + /* the plugged_size is just a reflection of what _we_ did previously */ + virtio_cread(vm->vdev, struct virtio_mem_config, plugged_size, + &new_plugged_size); + if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size)) + vm->plugged_size = new_plugged_size; + + /* calculate the last usable memory block id */ + virtio_cread(vm->vdev, struct virtio_mem_config, + usable_region_size, &usable_region_size); + end_addr = vm->addr + usable_region_size; + end_addr = min(end_addr, phys_limit); + vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1; + + /* see if there is a request to change the size */ + virtio_cread(vm->vdev, struct virtio_mem_config, requested_size, + &vm->requested_size); + + dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size); + dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size); +} + +/* + * Workqueue function for handling plug/unplug requests and config updates. + */ +static void virtio_mem_run_wq(struct work_struct *work) +{ + struct virtio_mem *vm = container_of(work, struct virtio_mem, wq); + uint64_t diff; + int rc; + + hrtimer_cancel(&vm->retry_timer); + + if (vm->broken) + return; + +retry: + rc = 0; + + /* Make sure we start with a clean state if there are leftovers. */ + if (unlikely(vm->unplug_all_required)) + rc = virtio_mem_send_unplug_all_request(vm); + + if (atomic_read(&vm->config_changed)) { + atomic_set(&vm->config_changed, 0); + virtio_mem_refresh_config(vm); + } + + /* Unplug any leftovers from previous runs */ + if (!rc) + rc = virtio_mem_unplug_pending_mb(vm); + + if (!rc && vm->requested_size != vm->plugged_size) { + if (vm->requested_size > vm->plugged_size) { + diff = vm->requested_size - vm->plugged_size; + rc = virtio_mem_plug_request(vm, diff); + } + /* TODO: try to unplug memory */ + } + + switch (rc) { + case 0: + break; + case -ENOSPC: + /* + * We cannot add any more memory (alignment, physical limit) + * or we have too many offline memory blocks. + */ + break; + case -EBUSY: + /* + * The hypervisor cannot process our request right now + * (e.g., out of memory, migrating). + */ + case -ENOMEM: + /* Out of memory, try again later. */ + hrtimer_start(&vm->retry_timer, + ms_to_ktime(VIRTIO_MEM_RETRY_TIMER_MS), + HRTIMER_MODE_REL); + break; + case -EAGAIN: + /* Retry immediately (e.g., the config changed). */ + goto retry; + default: + /* Unknown error, mark as broken */ + dev_err(&vm->vdev->dev, + "unknown error, marking device broken: %d\n", rc); + vm->broken = true; + } +} + +static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) +{ + struct virtio_mem *vm = container_of(timer, struct virtio_mem, + retry_timer); + + virtio_mem_retry(vm); + return HRTIMER_NORESTART; +} + +static void virtio_mem_handle_response(struct virtqueue *vq) +{ + struct virtio_mem *vm = vq->vdev->priv; + + wake_up(&vm->host_resp); +} + +static int virtio_mem_init_vq(struct virtio_mem *vm) +{ + struct virtqueue *vq; + + vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response, + "guest-request"); + if (IS_ERR(vq)) + return PTR_ERR(vq); + vm->vq = vq; + + return 0; +} + +/* + * Test if any memory in the range is present in Linux. + */ +static bool virtio_mem_any_memory_present(unsigned long start, + unsigned long size) +{ + const unsigned long start_pfn = PFN_DOWN(start); + const unsigned long end_pfn = PFN_UP(start + size); + unsigned long pfn; + + for (pfn = start_pfn; pfn != end_pfn; pfn++) + if (present_section_nr(pfn_to_section_nr(pfn))) + return true; + + return false; +} + +static int virtio_mem_init(struct virtio_mem *vm) +{ + const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + + if (!vm->vdev->config->get) { + dev_err(&vm->vdev->dev, "config access disabled\n"); + return -EINVAL; + } + + /* + * We don't want to (un)plug or reuse any memory when in kdump. The + * memory is still accessible (but not mapped). + */ + if (is_kdump_kernel()) { + dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n"); + return -EBUSY; + } + + /* Fetch all properties that can't change. */ + virtio_cread(vm->vdev, struct virtio_mem_config, plugged_size, + &vm->plugged_size); + virtio_cread(vm->vdev, struct virtio_mem_config, block_size, + &vm->device_block_size); + virtio_cread(vm->vdev, struct virtio_mem_config, addr, &vm->addr); + virtio_cread(vm->vdev, struct virtio_mem_config, region_size, + &vm->region_size); + + /* + * If we still have memory plugged, we might have to unplug all + * memory first. However, if somebody simply unloaded the driver + * we would have to reinitialize the old state - something we don't + * support yet. Detect if we have any memory in the area present. + */ + if (vm->plugged_size) { + uint64_t usable_region_size; + + virtio_cread(vm->vdev, struct virtio_mem_config, + usable_region_size, &usable_region_size); + + if (virtio_mem_any_memory_present(vm->addr, + usable_region_size)) { + dev_err(&vm->vdev->dev, + "reloading the driver is not supported\n"); + return -EINVAL; + } + /* + * Note: it might happen that the device is busy and + * unplugging all memory might take some time. + */ + dev_info(&vm->vdev->dev, "unplugging all memory required\n"); + vm->unplug_all_required = 1; + } + + /* + * We always hotplug memory in memory block granularity. This way, + * we have to wait for exactly one memory block to online. + */ + if (vm->device_block_size > memory_block_size_bytes()) { + dev_err(&vm->vdev->dev, + "The block size is not supported (too big).\n"); + return -EINVAL; + } + + /* bad device setup - warn only */ + if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) + dev_warn(&vm->vdev->dev, + "The alignment of the physical start address can make some memory unusable.\n"); + if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) + dev_warn(&vm->vdev->dev, + "The alignment of the physical end address can make some memory unusable.\n"); + if (vm->addr + vm->region_size > phys_limit) + dev_warn(&vm->vdev->dev, + "Some memory is not addressable. This can make some memory unusable.\n"); + + /* + * Calculate the subblock size: + * - At least MAX_ORDER - 1 / pageblock_order. + * - At least the device block size. + * In the worst case, a single subblock per memory block. + */ + vm->subblock_size = PAGE_SIZE * 1u << max_t(uint32_t, MAX_ORDER - 1, + pageblock_order); + vm->subblock_size = max_t(uint32_t, vm->device_block_size, + vm->subblock_size); + vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size; + + /* Round up to the next full memory block */ + vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 + + memory_block_size_bytes()); + vm->next_mb_id = vm->first_mb_id; + vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr + + vm->region_size) - 1; + + dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); + dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); + dev_info(&vm->vdev->dev, "device block size: 0x%x", + vm->device_block_size); + dev_info(&vm->vdev->dev, "memory block size: 0x%lx", + memory_block_size_bytes()); + dev_info(&vm->vdev->dev, "subblock size: 0x%x", + vm->subblock_size); + + return 0; +} + +static int virtio_mem_probe(struct virtio_device *vdev) +{ + struct virtio_mem *vm; + int rc = -EINVAL; + + vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL); + if (!vm) + return -ENOMEM; + + init_waitqueue_head(&vm->host_resp); + vm->vdev = vdev; + INIT_WORK(&vm->wq, virtio_mem_run_wq); + mutex_init(&vm->hotplug_mutex); + INIT_LIST_HEAD(&vm->next); + spin_lock_init(&vm->removal_lock); + hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + vm->retry_timer.function = virtio_mem_timer_expired; + + /* register the virtqueue */ + rc = virtio_mem_init_vq(vm); + if (rc) + goto out_free_vm; + + /* initialize the device by querying the config */ + rc = virtio_mem_init(vm); + if (rc) + goto out_del_vq; + + /* register callbacks */ + vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb; + rc = register_memory_notifier(&vm->memory_notifier); + if (rc) + goto out_del_vq; + rc = register_virtio_mem_device(vm); + if (rc) + goto out_unreg_mem; + + virtio_device_ready(vdev); + + /* trigger a config update to start processing the requested_size */ + atomic_set(&vm->config_changed, 1); + queue_work(system_freezable_wq, &vm->wq); + + return 0; +out_unreg_mem: + unregister_memory_notifier(&vm->memory_notifier); +out_del_vq: + vdev->config->del_vqs(vdev); +out_free_vm: + kfree(vm); + vdev->priv = NULL; + + return rc; +} + +static void virtio_mem_remove(struct virtio_device *vdev) +{ + struct virtio_mem *vm = vdev->priv; + unsigned long mb_id; + int rc; + + /* + * Make sure the workqueue won't be triggered anymore and no memory + * blocks can be onlined/offlined until we're finished here. + */ + mutex_lock(&vm->hotplug_mutex); + spin_lock_irq(&vm->removal_lock); + vm->removing = true; + spin_unlock_irq(&vm->removal_lock); + mutex_unlock(&vm->hotplug_mutex); + + /* wait until the workqueue stopped */ + cancel_work_sync(&vm->wq); + hrtimer_cancel(&vm->retry_timer); + + /* + * After we unregistered our callbacks, user space can online partially + * plugged offline blocks. Make sure to remove them. + */ + virtio_mem_for_each_mb_state(vm, mb_id, + VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { + rc = virtio_mem_mb_remove(vm, mb_id); + BUG_ON(rc); + virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED); + } + + /* unregister callbacks */ + unregister_virtio_mem_device(vm); + unregister_memory_notifier(&vm->memory_notifier); + + /* + * There is no way we could reliably remove all memory we have added to + * the system. And there is no way to stop the driver/device from going + * away. Warn at least. + */ + if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] || + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] || + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] || + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL] || + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE]) + dev_warn(&vdev->dev, "device still has system memory added\n"); + + /* remove all tracking data - no locking needed */ + vfree(vm->mb_state); + vfree(vm->sb_bitmap); + + /* reset the device and cleanup the queues */ + vdev->config->reset(vdev); + vdev->config->del_vqs(vdev); + + kfree(vm); + vdev->priv = NULL; +} + +static void virtio_mem_config_changed(struct virtio_device *vdev) +{ + struct virtio_mem *vm = vdev->priv; + + atomic_set(&vm->config_changed, 1); + virtio_mem_retry(vm); +} + +#ifdef CONFIG_PM_SLEEP +static int virtio_mem_freeze(struct virtio_device *vdev) +{ + /* + * When restarting the VM, all memory is usually unplugged. Don't + * allow to suspend/hibernate. + */ + dev_err(&vdev->dev, "save/restore not supported.\n"); + return -EPERM; +} + +static int virtio_mem_restore(struct virtio_device *vdev) +{ + return -EPERM; +} +#endif + +static struct virtio_device_id virtio_mem_id_table[] = { + { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_mem_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = virtio_mem_id_table, + .probe = virtio_mem_probe, + .remove = virtio_mem_remove, + .config_changed = virtio_mem_config_changed, +#ifdef CONFIG_PM_SLEEP + .freeze = virtio_mem_freeze, + .restore = virtio_mem_restore, +#endif +}; + +module_virtio_driver(virtio_mem_driver); +MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table); +MODULE_AUTHOR("David Hildenbrand "); +MODULE_DESCRIPTION("Virtio-mem driver"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index ecc27a17401a..b052355ac7a3 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -44,6 +44,7 @@ #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ #define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ +#define VIRTIO_ID_MEM 24 /* virtio mem */ #define VIRTIO_ID_FS 26 /* virtio filesystem */ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ diff --git a/include/uapi/linux/virtio_mem.h b/include/uapi/linux/virtio_mem.h new file mode 100644 index 000000000000..1bfade78bdfd --- /dev/null +++ b/include/uapi/linux/virtio_mem.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * Virtio Mem Device + * + * Copyright Red Hat, Inc. 2020 + * + * Authors: + * David Hildenbrand + * + * This header is BSD licensed so anyone can use the definitions + * to implement compatible drivers/servers: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _LINUX_VIRTIO_MEM_H +#define _LINUX_VIRTIO_MEM_H + +#include +#include +#include +#include + +/* + * Each virtio-mem device manages a dedicated region in physical address + * space. Each device can belong to a single NUMA node, multiple devices + * for a single NUMA node are possible. A virtio-mem device is like a + * "resizable DIMM" consisting of small memory blocks that can be plugged + * or unplugged. The device driver is responsible for (un)plugging memory + * blocks on demand. + * + * Virtio-mem devices can only operate on their assigned memory region in + * order to (un)plug memory. A device cannot (un)plug memory belonging to + * other devices. + * + * The "region_size" corresponds to the maximum amount of memory that can + * be provided by a device. The "size" corresponds to the amount of memory + * that is currently plugged. "requested_size" corresponds to a request + * from the device to the device driver to (un)plug blocks. The + * device driver should try to (un)plug blocks in order to reach the + * "requested_size". It is impossible to plug more memory than requested. + * + * The "usable_region_size" represents the memory region that can actually + * be used to (un)plug memory. It is always at least as big as the + * "requested_size" and will grow dynamically. It will only shrink when + * explicitly triggered (VIRTIO_MEM_REQ_UNPLUG). + * + * There are no guarantees what will happen if unplugged memory is + * read/written. Such memory should, in general, not be touched. E.g., + * even writing might succeed, but the values will simply be discarded at + * random points in time. + * + * It can happen that the device cannot process a request, because it is + * busy. The device driver has to retry later. + * + * Usually, during system resets all memory will get unplugged, so the + * device driver can start with a clean state. However, in specific + * scenarios (if the device is busy) it can happen that the device still + * has memory plugged. The device driver can request to unplug all memory + * (VIRTIO_MEM_REQ_UNPLUG) - which might take a while to succeed if the + * device is busy. + */ + +/* --- virtio-mem: guest -> host requests --- */ + +/* request to plug memory blocks */ +#define VIRTIO_MEM_REQ_PLUG 0 +/* request to unplug memory blocks */ +#define VIRTIO_MEM_REQ_UNPLUG 1 +/* request to unplug all blocks and shrink the usable size */ +#define VIRTIO_MEM_REQ_UNPLUG_ALL 2 +/* request information about the plugged state of memory blocks */ +#define VIRTIO_MEM_REQ_STATE 3 + +struct virtio_mem_req_plug { + __virtio64 addr; + __virtio16 nb_blocks; +}; + +struct virtio_mem_req_unplug { + __virtio64 addr; + __virtio16 nb_blocks; +}; + +struct virtio_mem_req_state { + __virtio64 addr; + __virtio16 nb_blocks; +}; + +struct virtio_mem_req { + __virtio16 type; + __virtio16 padding[3]; + + union { + struct virtio_mem_req_plug plug; + struct virtio_mem_req_unplug unplug; + struct virtio_mem_req_state state; + } u; +}; + + +/* --- virtio-mem: host -> guest response --- */ + +/* + * Request processed successfully, applicable for + * - VIRTIO_MEM_REQ_PLUG + * - VIRTIO_MEM_REQ_UNPLUG + * - VIRTIO_MEM_REQ_UNPLUG_ALL + * - VIRTIO_MEM_REQ_STATE + */ +#define VIRTIO_MEM_RESP_ACK 0 +/* + * Request denied - e.g. trying to plug more than requested, applicable for + * - VIRTIO_MEM_REQ_PLUG + */ +#define VIRTIO_MEM_RESP_NACK 1 +/* + * Request cannot be processed right now, try again later, applicable for + * - VIRTIO_MEM_REQ_PLUG + * - VIRTIO_MEM_REQ_UNPLUG + * - VIRTIO_MEM_REQ_UNPLUG_ALL + */ +#define VIRTIO_MEM_RESP_BUSY 2 +/* + * Error in request (e.g. addresses/alignment), applicable for + * - VIRTIO_MEM_REQ_PLUG + * - VIRTIO_MEM_REQ_UNPLUG + * - VIRTIO_MEM_REQ_STATE + */ +#define VIRTIO_MEM_RESP_ERROR 3 + + +/* State of memory blocks is "plugged" */ +#define VIRTIO_MEM_STATE_PLUGGED 0 +/* State of memory blocks is "unplugged" */ +#define VIRTIO_MEM_STATE_UNPLUGGED 1 +/* State of memory blocks is "mixed" */ +#define VIRTIO_MEM_STATE_MIXED 2 + +struct virtio_mem_resp_state { + __virtio16 state; +}; + +struct virtio_mem_resp { + __virtio16 type; + __virtio16 padding[3]; + + union { + struct virtio_mem_resp_state state; + } u; +}; + +/* --- virtio-mem: configuration --- */ + +struct virtio_mem_config { + /* Block size and alignment. Cannot change. */ + __u32 block_size; + __u32 padding; + /* Start address of the memory region. Cannot change. */ + __u64 addr; + /* Region size (maximum). Cannot change. */ + __u64 region_size; + /* + * Currently usable region size. Can grow up to region_size. Can + * shrink due to VIRTIO_MEM_REQ_UNPLUG_ALL (in which case no config + * update will be sent). + */ + __u64 usable_region_size; + /* + * Currently used size. Changes due to plug/unplug requests, but no + * config updates will be sent. + */ + __u64 plugged_size; + /* Requested size. New plug requests cannot exceed it. Can change. */ + __u64 requested_size; +}; + +#endif /* _LINUX_VIRTIO_MEM_H */ -- cgit v1.2.3 From f2af6d3978d74a7891d0f428537b4494498202cb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 7 May 2020 16:01:27 +0200 Subject: virtio-mem: Allow to specify an ACPI PXM as nid We want to allow to specify (similar as for a DIMM), to which node a virtio-mem device (and, therefore, its memory) belongs. Add a new virtio-mem feature flag and export pxm_to_node, so it can be used in kernel module context. Acked-by: Michal Hocko # for the export Acked-by: "Rafael J. Wysocki" # for the export Acked-by: Pankaj Gupta Tested-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Oscar Salvador Cc: Michal Hocko Cc: Igor Mammedov Cc: Dave Young Cc: Andrew Morton Cc: Dan Williams Cc: Pavel Tatashin Cc: Stefan Hajnoczi Cc: Vlastimil Babka Cc: Len Brown Cc: linux-acpi@vger.kernel.org Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20200507140139.17083-4-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/acpi/numa/srat.c | 1 + drivers/virtio/virtio_mem.c | 39 +++++++++++++++++++++++++++++++++++++-- include/uapi/linux/virtio_mem.h | 10 +++++++++- 3 files changed, 47 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 47b4969d9b93..5be5a977da1b 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -35,6 +35,7 @@ int pxm_to_node(int pxm) return NUMA_NO_NODE; return pxm_to_node_map[pxm]; } +EXPORT_SYMBOL(pxm_to_node); int node_to_pxm(int node) { diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 5d1dcaa6fc42..270ddeaec059 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -21,6 +21,8 @@ #include #include +#include + enum virtio_mem_mb_state { /* Unplugged, not added to Linux. Can be reused later. */ VIRTIO_MEM_MB_STATE_UNUSED = 0, @@ -72,6 +74,8 @@ struct virtio_mem { /* The device block size (for communicating with the device). */ uint32_t device_block_size; + /* The translated node id. NUMA_NO_NODE in case not specified. */ + int nid; /* Physical start address of the memory region. */ uint64_t addr; /* Maximum region size in bytes. */ @@ -389,7 +393,10 @@ static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = memory_add_physaddr_to_nid(addr); + int nid = vm->nid; + + if (nid == NUMA_NO_NODE) + nid = memory_add_physaddr_to_nid(addr); dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); return add_memory(nid, addr, memory_block_size_bytes()); @@ -407,7 +414,10 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = memory_add_physaddr_to_nid(addr); + int nid = vm->nid; + + if (nid == NUMA_NO_NODE) + nid = memory_add_physaddr_to_nid(addr); dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); return remove_memory(nid, addr, memory_block_size_bytes()); @@ -426,6 +436,17 @@ static void virtio_mem_retry(struct virtio_mem *vm) spin_unlock_irqrestore(&vm->removal_lock, flags); } +static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) +{ + int node = NUMA_NO_NODE; + +#if defined(CONFIG_ACPI_NUMA) + if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) + node = pxm_to_node(node_id); +#endif + return node; +} + /* * Test if a virtio-mem device overlaps with the given range. Can be called * from (notifier) callbacks lockless. @@ -1267,6 +1288,7 @@ static bool virtio_mem_any_memory_present(unsigned long start, static int virtio_mem_init(struct virtio_mem *vm) { const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + uint16_t node_id; if (!vm->vdev->config->get) { dev_err(&vm->vdev->dev, "config access disabled\n"); @@ -1287,6 +1309,9 @@ static int virtio_mem_init(struct virtio_mem *vm) &vm->plugged_size); virtio_cread(vm->vdev, struct virtio_mem_config, block_size, &vm->device_block_size); + virtio_cread(vm->vdev, struct virtio_mem_config, node_id, + &node_id); + vm->nid = virtio_mem_translate_node_id(vm, node_id); virtio_cread(vm->vdev, struct virtio_mem_config, addr, &vm->addr); virtio_cread(vm->vdev, struct virtio_mem_config, region_size, &vm->region_size); @@ -1365,6 +1390,8 @@ static int virtio_mem_init(struct virtio_mem *vm) memory_block_size_bytes()); dev_info(&vm->vdev->dev, "subblock size: 0x%x", vm->subblock_size); + if (vm->nid != NUMA_NO_NODE) + dev_info(&vm->vdev->dev, "nid: %d", vm->nid); return 0; } @@ -1508,12 +1535,20 @@ static int virtio_mem_restore(struct virtio_device *vdev) } #endif +static unsigned int virtio_mem_features[] = { +#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA) + VIRTIO_MEM_F_ACPI_PXM, +#endif +}; + static struct virtio_device_id virtio_mem_id_table[] = { { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, { 0 }, }; static struct virtio_driver virtio_mem_driver = { + .feature_table = virtio_mem_features, + .feature_table_size = ARRAY_SIZE(virtio_mem_features), .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = virtio_mem_id_table, diff --git a/include/uapi/linux/virtio_mem.h b/include/uapi/linux/virtio_mem.h index 1bfade78bdfd..e0a9dc7397c3 100644 --- a/include/uapi/linux/virtio_mem.h +++ b/include/uapi/linux/virtio_mem.h @@ -83,6 +83,12 @@ * device is busy. */ +/* --- virtio-mem: feature bits --- */ + +/* node_id is an ACPI PXM and is valid */ +#define VIRTIO_MEM_F_ACPI_PXM 0 + + /* --- virtio-mem: guest -> host requests --- */ /* request to plug memory blocks */ @@ -177,7 +183,9 @@ struct virtio_mem_resp { struct virtio_mem_config { /* Block size and alignment. Cannot change. */ __u32 block_size; - __u32 padding; + /* Valid with VIRTIO_MEM_F_ACPI_PXM. Cannot change. */ + __u16 node_id; + __u16 padding; /* Start address of the memory region. Cannot change. */ __u64 addr; /* Region size (maximum). Cannot change. */ -- cgit v1.2.3 From fce8afd76e3a4d8c59c92f84f8027569fd7031d0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 May 2020 12:14:02 +0200 Subject: virtio-mem: Don't rely on implicit compiler padding for requests The compiler will add padding after the last member, make that explicit. The size of a request is always 24 bytes. The size of a response always 10 bytes. Add compile-time checks. Cc: "Michael S. Tsirkin" Cc: Pankaj Gupta Cc: teawater Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20200515101402.16597-1-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 3 +++ include/uapi/linux/virtio_mem.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 9e523db3bee1..f658fe9149be 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1770,6 +1770,9 @@ static int virtio_mem_probe(struct virtio_device *vdev) struct virtio_mem *vm; int rc = -EINVAL; + BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24); + BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10); + vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL); if (!vm) return -ENOMEM; diff --git a/include/uapi/linux/virtio_mem.h b/include/uapi/linux/virtio_mem.h index e0a9dc7397c3..a455c488a995 100644 --- a/include/uapi/linux/virtio_mem.h +++ b/include/uapi/linux/virtio_mem.h @@ -103,16 +103,19 @@ struct virtio_mem_req_plug { __virtio64 addr; __virtio16 nb_blocks; + __virtio16 padding[3]; }; struct virtio_mem_req_unplug { __virtio64 addr; __virtio16 nb_blocks; + __virtio16 padding[3]; }; struct virtio_mem_req_state { __virtio64 addr; __virtio16 nb_blocks; + __virtio16 padding[3]; }; struct virtio_mem_req { -- cgit v1.2.3 From f286d627ef026a4d04b41ae5917d58ddf243c3c5 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 13 Jan 2020 21:21:49 +0100 Subject: gfs2: Keep track of deleted inode generations in LVBs When deleting an inode, keep track of the generation of the deleted inode in the inode glock Lock Value Block (LVB). When trying to delete an inode remotely, check the last-known inode generation against the deleted inode generation to skip duplicate remote deletes. This avoids taking the resource group glock in order to verify the block type. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 19 +++++++++++++++++++ fs/gfs2/glock.h | 3 +++ fs/gfs2/glops.c | 2 +- fs/gfs2/super.c | 3 +++ include/uapi/linux/gfs2_ondisk.h | 6 ++++++ 5 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 86e9e621f346..12681616eb76 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -755,6 +755,25 @@ out_unlock: return; } +void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) +{ + struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; + + if (ri->ri_magic == 0) + ri->ri_magic = cpu_to_be32(GFS2_MAGIC); + if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC)) + ri->ri_generation_deleted = cpu_to_be64(generation); +} + +bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) +{ + struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; + + if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC)) + return false; + return generation <= be64_to_cpu(ri->ri_generation_deleted); +} + static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index b8adaf80e4c5..5c1b60fdedcf 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -306,4 +306,7 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) spin_unlock(&gl->gl_lockref.lock); } +extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); +extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 9e9c7a4b8c66..63ae9e45ce34 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -692,7 +692,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_lock = inode_go_lock, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_flags = GLOF_ASPACE | GLOF_LRU, + .go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB, .go_free = inode_go_free, }; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 956fced0a8ec..e69efed9fb51 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1315,6 +1315,8 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } + if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) + goto out_truncate; error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); if (error) goto out_truncate; @@ -1368,6 +1370,7 @@ alloc_failed: that subsequent inode creates don't see an old gl_object. */ glock_clear_object(ip->i_gl, ip); error = gfs2_dinode_dealloc(ip); + gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); goto out_unlock; out_truncate: diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h index 2dc10a034de1..07e508e6691b 100644 --- a/include/uapi/linux/gfs2_ondisk.h +++ b/include/uapi/linux/gfs2_ondisk.h @@ -171,6 +171,12 @@ struct gfs2_rindex { #define GFS2_RGF_NOALLOC 0x00000008 #define GFS2_RGF_TRIMMED 0x00000010 +struct gfs2_inode_lvb { + __be32 ri_magic; + __be32 __pad; + __be64 ri_generation_deleted; +}; + struct gfs2_rgrp_lvb { __be32 rl_magic; __be32 rl_flags; -- cgit v1.2.3 From 776f395004d829bbbf18c159ed9beb517a208c71 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Fri, 5 Jun 2020 18:27:13 +0800 Subject: vhost_vdpa: Support config interrupt in vdpa This commit implements config interrupt support in vhost_vdpa layer. Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Link: https://lore.kernel.org/r/1591352835-22441-4-git-send-email-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 4 ++++ 2 files changed, 51 insertions(+) (limited to 'include/uapi') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 6ca7660ee6b5..77a0c9fb6cc3 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "vhost.h" @@ -71,6 +72,7 @@ struct vhost_vdpa { int nvqs; int virtio_id; int minor; + struct eventfd_ctx *config_ctx; }; static DEFINE_IDA(vhost_vdpa_ida); @@ -102,6 +104,17 @@ static irqreturn_t vhost_vdpa_virtqueue_cb(void *private) return IRQ_HANDLED; } +static irqreturn_t vhost_vdpa_config_cb(void *private) +{ + struct vhost_vdpa *v = private; + struct eventfd_ctx *config_ctx = v->config_ctx; + + if (config_ctx) + eventfd_signal(config_ctx, 1); + + return IRQ_HANDLED; +} + static void vhost_vdpa_reset(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; @@ -289,6 +302,36 @@ static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp) return 0; } +static void vhost_vdpa_config_put(struct vhost_vdpa *v) +{ + if (v->config_ctx) + eventfd_ctx_put(v->config_ctx); +} + +static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp) +{ + struct vdpa_callback cb; + int fd; + struct eventfd_ctx *ctx; + + cb.callback = vhost_vdpa_config_cb; + cb.private = v->vdpa; + if (copy_from_user(&fd, argp, sizeof(fd))) + return -EFAULT; + + ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd); + swap(ctx, v->config_ctx); + + if (!IS_ERR_OR_NULL(ctx)) + eventfd_ctx_put(ctx); + + if (IS_ERR(v->config_ctx)) + return PTR_ERR(v->config_ctx); + + v->vdpa->config->set_config_cb(v->vdpa, &cb); + + return 0; +} static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { @@ -396,6 +439,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, case VHOST_SET_LOG_FD: r = -ENOIOCTLCMD; break; + case VHOST_VDPA_SET_CONFIG_CALL: + r = vhost_vdpa_set_config_call(v, argp); + break; default: r = vhost_dev_ioctl(&v->vdev, cmd, argp); if (r == -ENOIOCTLCMD) @@ -730,6 +776,7 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep) vhost_dev_stop(&v->vdev); vhost_vdpa_iotlb_free(v); vhost_vdpa_free_domain(v); + vhost_vdpa_config_put(v); vhost_dev_cleanup(&v->vdev); kfree(v->vdev.vqs); mutex_unlock(&d->mutex); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 9fe72e4b1373..0c2349612e77 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -15,6 +15,8 @@ #include #include +#define VHOST_FILE_UNBIND -1 + /* ioctls */ #define VHOST_VIRTIO 0xAF @@ -140,4 +142,6 @@ /* Get the max ring size. */ #define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16) +/* Set event fd for config interrupt*/ +#define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int) #endif -- cgit v1.2.3 From 97eda66421c44f1449e8d087fd05eab5d466afb7 Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Fri, 5 Jun 2020 17:41:11 +0200 Subject: include: fix wiki website url in netlink interface header The wiki url is still the old "wireless.kernel.org" instead of the new "wireless.wiki.kernel.org" Signed-off-by: Flavio Suligoi Link: https://lore.kernel.org/r/20200605154112.16277-9-f.suligoi@asem.it Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index dad8c8f8581f..4e6339ab1fce 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -794,7 +794,7 @@ * various triggers. These triggers can be configured through this * command with the %NL80211_ATTR_WOWLAN_TRIGGERS attribute. For * more background information, see - * http://wireless.kernel.org/en/users/Documentation/WoWLAN. + * https://wireless.wiki.kernel.org/en/users/Documentation/WoWLAN. * The @NL80211_CMD_SET_WOWLAN command can also be used as a notification * from the driver reporting the wakeup reason. In this case, the * @NL80211_ATTR_WOWLAN_TRIGGERS attribute will contain the reason -- cgit v1.2.3 From 544fc7dbbf920a3e64d109c416ee229e8e1763c5 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 8 Jun 2020 02:03:15 -0400 Subject: virtio_mem: convert device block size into 64bit If subblock size is large (e.g. 1G) 32 bit math involving it can overflow. Rather than try to catch all instances of that, let's tweak block size to 64 bit. It ripples through UAPI which is an ABI change, but it's not too late to make it, and it will allow supporting >4Gbyte blocks while might become necessary down the road. Fixes: 5f1f79bbc9e26 ("virtio-mem: Paravirtualized memory hotplug") Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand --- drivers/virtio/virtio_mem.c | 18 +++++++++--------- include/uapi/linux/virtio_mem.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 2f357142ea5e..50c689f25045 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -77,7 +77,7 @@ struct virtio_mem { uint64_t requested_size; /* The device block size (for communicating with the device). */ - uint32_t device_block_size; + uint64_t device_block_size; /* The translated node id. NUMA_NO_NODE in case not specified. */ int nid; /* Physical start address of the memory region. */ @@ -86,7 +86,7 @@ struct virtio_mem { uint64_t region_size; /* The subblock size. */ - uint32_t subblock_size; + uint64_t subblock_size; /* The number of subblocks per memory block. */ uint32_t nb_sb_per_mb; @@ -1698,9 +1698,9 @@ static int virtio_mem_init(struct virtio_mem *vm) * - At least the device block size. * In the worst case, a single subblock per memory block. */ - vm->subblock_size = PAGE_SIZE * 1u << max_t(uint32_t, MAX_ORDER - 1, - pageblock_order); - vm->subblock_size = max_t(uint32_t, vm->device_block_size, + vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1, + pageblock_order); + vm->subblock_size = max_t(uint64_t, vm->device_block_size, vm->subblock_size); vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size; @@ -1713,12 +1713,12 @@ static int virtio_mem_init(struct virtio_mem *vm) dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); - dev_info(&vm->vdev->dev, "device block size: 0x%x", - vm->device_block_size); + dev_info(&vm->vdev->dev, "device block size: 0x%llx", + (unsigned long long)vm->device_block_size); dev_info(&vm->vdev->dev, "memory block size: 0x%lx", memory_block_size_bytes()); - dev_info(&vm->vdev->dev, "subblock size: 0x%x", - vm->subblock_size); + dev_info(&vm->vdev->dev, "subblock size: 0x%llx", + (unsigned long long)vm->subblock_size); if (vm->nid != NUMA_NO_NODE) dev_info(&vm->vdev->dev, "nid: %d", vm->nid); diff --git a/include/uapi/linux/virtio_mem.h b/include/uapi/linux/virtio_mem.h index a455c488a995..a9ffe041843c 100644 --- a/include/uapi/linux/virtio_mem.h +++ b/include/uapi/linux/virtio_mem.h @@ -185,10 +185,10 @@ struct virtio_mem_resp { struct virtio_mem_config { /* Block size and alignment. Cannot change. */ - __u32 block_size; + __u64 block_size; /* Valid with VIRTIO_MEM_F_ACPI_PXM. Cannot change. */ __u16 node_id; - __u16 padding; + __u8 padding[6]; /* Start address of the memory region. Cannot change. */ __u64 addr; /* Region size (maximum). Cannot change. */ -- cgit v1.2.3 From 281920b7e0b31e0a7706433ff58e7d52ac97c327 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Jun 2020 15:31:46 +0200 Subject: bpf: Devmap adjust uapi for attach bpf program V2: - Defer changing BPF-syscall to start at file-descriptor 1 - Use {} to zero initialise struct. The recent commit fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry"), introduced ability to attach (and run) a separate XDP bpf_prog for each devmap entry. A bpf_prog is added via a file-descriptor. As zero were a valid FD, not using the feature requires using value minus-1. The UAPI is extended via tail-extending struct bpf_devmap_val and using map->value_size to determine the feature set. This will break older userspace applications not using the bpf_prog feature. Consider an old userspace app that is compiled against newer kernel uapi/bpf.h, it will not know that it need to initialise the member bpf_prog.fd to minus-1. Thus, users will be forced to update source code to get program running on newer kernels. This patch remove the minus-1 checks, and have zero mean feature isn't used. Followup patches either for kernel or libbpf should handle and avoid returning file-descriptor zero in the first place. Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159170950687.2102545.7235914718298050113.stgit@firesoul --- include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/devmap.c | 17 ++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c65b374a5090..19684813faae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3761,6 +3761,19 @@ struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index bfdff2faf5cb..0cbb72cdaf63 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,15 +60,6 @@ struct xdp_dev_bulk_queue { unsigned int count; }; -/* DEVMAP values */ -struct bpf_devmap_val { - u32 ifindex; /* device index */ - union { - int fd; /* prog fd on map write */ - u32 id; /* prog id on map read */ - } bpf_prog; -}; - struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; @@ -619,7 +610,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; - if (val->bpf_prog.fd >= 0) { + if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) @@ -653,8 +644,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -670,7 +661,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ - if (val.bpf_prog.fd != -1) + if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); @@ -700,8 +691,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; -- cgit v1.2.3 From 7bb64402a092136fb2fda995b0e0304b43c163c5 Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Thu, 11 Jun 2020 20:56:52 +0800 Subject: spi: tools: Add macro definitions to fix build errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SPI_TX_OCTAL and SPI_RX_OCTAL to fix the following build errors: CC spidev_test.o spidev_test.c: In function ‘transfer’: spidev_test.c:131:13: error: ‘SPI_TX_OCTAL’ undeclared (first use in this function) if (mode & SPI_TX_OCTAL) ^ spidev_test.c:131:13: note: each undeclared identifier is reported only once for each function it appears in spidev_test.c:137:13: error: ‘SPI_RX_OCTAL’ undeclared (first use in this function) if (mode & SPI_RX_OCTAL) ^ spidev_test.c: In function ‘parse_opts’: spidev_test.c:290:12: error: ‘SPI_TX_OCTAL’ undeclared (first use in this function) mode |= SPI_TX_OCTAL; ^ spidev_test.c:308:12: error: ‘SPI_RX_OCTAL’ undeclared (first use in this function) mode |= SPI_RX_OCTAL; ^ LD spidev_test-in.o ld: cannot find spidev_test.o: No such file or directory Additionally, maybe SPI_CS_WORD and SPI_3WIRE_HIZ will be used in the future, so add them too. Fixes: 896fa735084e ("spi: spidev_test: Add support for Octal mode data transfers") Signed-off-by: Qing Zhang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/1591880212-13479-2-git-send-email-zhangqing@loongson.cn Signed-off-by: Mark Brown --- include/uapi/linux/spi/spidev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/spi/spidev.h b/include/uapi/linux/spi/spidev.h index ee0f2460bff6..9390615d52f0 100644 --- a/include/uapi/linux/spi/spidev.h +++ b/include/uapi/linux/spi/spidev.h @@ -48,6 +48,10 @@ #define SPI_TX_QUAD 0x200 #define SPI_RX_DUAL 0x400 #define SPI_RX_QUAD 0x800 +#define SPI_CS_WORD 0x1000 +#define SPI_TX_OCTAL 0x2000 +#define SPI_RX_OCTAL 0x4000 +#define SPI_3WIRE_HIZ 0x8000 /*---------------------------------------------------------------------------*/ -- cgit v1.2.3 From 88ee9d571b6d8ed345f877e05f685814412e359b Mon Sep 17 00:00:00 2001 From: "Jan (janneke) Nieuwenhuizen" Date: Mon, 25 May 2020 21:39:40 +0200 Subject: ext4: support xattr gnu.* namespace for the Hurd The Hurd gained[0] support for moving the translator and author fields out of the inode and into the "gnu.*" xattr namespace. In anticipation of that, an xattr INDEX was reserved[1]. The Hurd has now been brought into compliance[2] with that. This patch adds support for reading and writing such attributes from Linux; you can now do something like mkdir -p hurd-root/servers/socket touch hurd-root/servers/socket/1 setfattr --name=gnu.translator --value='"/hurd/pflocal\0"' \ hurd-root/servers/socket/1 getfattr --name=gnu.translator hurd-root/servers/socket/1 # file: 1 gnu.translator="/hurd/pflocal" to setup a pipe translator, which is being used to create[3] a vm-image for the Hurd from GNU Guix. [0] https://summerofcode.withgoogle.com/projects/#5869799859027968 [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3980bd3b406addb327d858aebd19e229ea340b9a [2] https://git.savannah.gnu.org/cgit/hurd/hurd.git/commit/?id=a04c7bf83172faa7cb080fbe3b6c04a8415ca645 [3] https://git.savannah.gnu.org/cgit/guix.git/log/?h=wip-hurd-vm Signed-off-by: Jan Nieuwenhuizen Link: https://lore.kernel.org/r/20200525193940.878-1-janneke@gnu.org Signed-off-by: Theodore Ts'o --- fs/ext4/Makefile | 3 ++- fs/ext4/xattr.c | 2 ++ fs/ext4/xattr.h | 1 + fs/ext4/xattr_hurd.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/xattr.h | 4 ++++ 5 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 fs/ext4/xattr_hurd.c (limited to 'include/uapi') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 4ccb3c9189d8..2e42f47a7f98 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -9,7 +9,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ - super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o + super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ + xattr_user.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 9b29a40738ac..7d2f6576d954 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -93,6 +93,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif + [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler *ext4_xattr_handlers[] = { @@ -105,6 +106,7 @@ const struct xattr_handler *ext4_xattr_handlers[] = { #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif + &ext4_xattr_hurd_handler, NULL }; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ffe21ac77f78..730b91fa0dd7 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -124,6 +124,7 @@ struct ext4_xattr_inode_array { extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; +extern const struct xattr_handler ext4_xattr_hurd_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c new file mode 100644 index 000000000000..8cfa74a56361 --- /dev/null +++ b/fs/ext4/xattr_hurd.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/xattr_hurd.c + * Handler for extended gnu attributes for the Hurd. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, + */ + +#include +#include +#include "ext4.h" +#include "xattr.h" + +static bool +ext4_xattr_hurd_list(struct dentry *dentry) +{ + return test_opt(dentry->d_sb, XATTR_USER); +} + +static int +ext4_xattr_hurd_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD, + name, buffer, size); +} + +static int +ext4_xattr_hurd_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_hurd_handler = { + .prefix = XATTR_HURD_PREFIX, + .list = ext4_xattr_hurd_list, + .get = ext4_xattr_hurd_get, + .set = ext4_xattr_hurd_set, +}; diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index c1395b5bd432..9463db2dfa9d 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -7,6 +7,7 @@ Copyright (C) 2001 by Andreas Gruenbacher Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris + Copyright (c) 2020 Jan (janneke) Nieuwenhuizen */ #include @@ -31,6 +32,9 @@ #define XATTR_BTRFS_PREFIX "btrfs." #define XATTR_BTRFS_PREFIX_LEN (sizeof(XATTR_BTRFS_PREFIX) - 1) +#define XATTR_HURD_PREFIX "gnu." +#define XATTR_HURD_PREFIX_LEN (sizeof(XATTR_HURD_PREFIX) - 1) + #define XATTR_SECURITY_PREFIX "security." #define XATTR_SECURITY_PREFIX_LEN (sizeof(XATTR_SECURITY_PREFIX) - 1) -- cgit v1.2.3 From e17d43b93e544f5016c0251d2074c15568d5d963 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:08 +0300 Subject: perf: Add perf text poke event Record (single instruction) changes to the kernel text (i.e. self-modifying code) in order to support tracers like Intel PT and ARM CoreSight. A copy of the running kernel code is needed as a reference point (e.g. from /proc/kcore). The text poke event records the old bytes and the new bytes so that the event can be processed forwards or backwards. The basic problem is recording the modified instruction in an unambiguous manner given SMP instruction cache (in)coherence. That is, when modifying an instruction concurrently any solution with one or multiple timestamps is not sufficient: CPU0 CPU1 0 1 write insn A 2 execute insn A 3 sync-I$ 4 Due to I$, CPU1 might execute either the old or new A. No matter where we record tracepoints on CPU0, one simply cannot tell what CPU1 will have observed, except that at 0 it must be the old one and at 4 it must be the new one. To solve this, take inspiration from x86 text poking, which has to solve this exact problem due to variable length instruction encoding and I-fetch windows. 1) overwrite the instruction with a breakpoint and sync I$ This guarantees that that code flow will never hit the target instruction anymore, on any CPU (or rather, it will cause an exception). 2) issue the TEXT_POKE event 3) overwrite the breakpoint with the new instruction and sync I$ Now we know that any execution after the TEXT_POKE event will either observe the breakpoint (and hit the exception) or the new instruction. So by guarding the TEXT_POKE event with an exception on either side; we can now tell, without doubt, which instruction another CPU will have observed. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-2-adrian.hunter@intel.com --- include/linux/perf_event.h | 8 ++++ include/uapi/linux/perf_event.h | 21 +++++++++- kernel/events/core.c | 90 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b4bb32082342..46fe5cfb5163 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1232,6 +1232,9 @@ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); +extern void perf_event_text_poke(const void *addr, + const void *old_bytes, size_t old_len, + const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); @@ -1479,6 +1482,11 @@ static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } +static inline void perf_event_text_poke(const void *addr, + const void *old_bytes, + size_t old_len, + const void *new_bytes, + size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..e5bee6c17b86 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -383,7 +383,8 @@ struct perf_event_attr { bpf_event : 1, /* include bpf events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ - __reserved_1 : 31; + text_poke : 1, /* include text poke events */ + __reserved_1 : 30; union { __u32 wakeup_events; /* wakeup every n events */ @@ -1024,6 +1025,24 @@ enum perf_event_type { */ PERF_RECORD_CGROUP = 19, + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..9b8f92500833 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || - attr->context_switch || + attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; @@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if (!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + .new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = (unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { /* -- cgit v1.2.3 From 69e49088692899d25dedfa22f00dfb9761e86ed7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:11 +0300 Subject: kprobes: Add perf ksymbol events for kprobe insn pages Symbols are needed for tools to describe instruction addresses. Pages allocated for kprobe's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200512121922.8997-5-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 5 +++++ kernel/kprobes.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e5bee6c17b86..e1a4179144a1 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1049,6 +1049,11 @@ enum perf_event_type { enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes. + */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ }; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 058c0be3464b..2b58740ca0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -184,6 +185,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -202,6 +207,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. + */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); -- cgit v1.2.3 From dd9ddf466ad7a5d2e247925d81ebb0b878bf3b76 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:14 +0300 Subject: ftrace: Add perf ksymbol events for ftrace trampolines Symbols are needed for tools to describe instruction addresses. Pages allocated for ftrace's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-8-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 2 +- kernel/trace/ftrace.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e1a4179144a1..52ca2093831c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1051,7 +1051,7 @@ enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_BPF = 1, /* * Out of line code such as kprobe-replaced instructions or optimized - * kprobes. + * kprobes or ftrace trampolines. */ PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 31675b209db2..2baaf7716537 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2790,8 +2790,13 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) static void ftrace_trampoline_free(struct ftrace_ops *ops) { if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && - ops->trampoline) + ops->trampoline) { + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ ftrace_remove_trampoline_from_kallsyms(ops); + } arch_ftrace_trampoline_free(ops); } @@ -6805,8 +6810,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) arch_ftrace_update_trampoline(ops); if (ops->trampoline && ops->trampoline != trampoline && - (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + } } void ftrace_init_trace_array(struct trace_array *tr) -- cgit v1.2.3 From 27784a256c2a453d891c0aaff84c3ac3f2e8a5a0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sat, 13 Jun 2020 09:37:54 +0200 Subject: spi: uapi: spidev: Use TABs for alignment The UAPI uses TABs for alignment. Convert the recently introduced spaces to TABs to restore consistency. Fixes: 7bb64402a092136 ("spi: tools: Add macro definitions to fix build errors") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20200613073755.15906-1-geert+renesas@glider.be Signed-off-by: Mark Brown --- include/uapi/linux/spi/spidev.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/spi/spidev.h b/include/uapi/linux/spi/spidev.h index 9390615d52f0..d56427c0b3e0 100644 --- a/include/uapi/linux/spi/spidev.h +++ b/include/uapi/linux/spi/spidev.h @@ -48,10 +48,10 @@ #define SPI_TX_QUAD 0x200 #define SPI_RX_DUAL 0x400 #define SPI_RX_QUAD 0x800 -#define SPI_CS_WORD 0x1000 -#define SPI_TX_OCTAL 0x2000 -#define SPI_RX_OCTAL 0x4000 -#define SPI_3WIRE_HIZ 0x8000 +#define SPI_CS_WORD 0x1000 +#define SPI_TX_OCTAL 0x2000 +#define SPI_RX_OCTAL 0x4000 +#define SPI_3WIRE_HIZ 0x8000 /*---------------------------------------------------------------------------*/ -- cgit v1.2.3 From b0659d8a950d424e57cc0a67afc4740ee561224e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Jun 2020 14:49:26 -0700 Subject: bpf: Fix definition of bpf_ringbuf_output() helper in UAPI comments Fix definition of bpf_ringbuf_output() in UAPI header comments, which is used to generate libbpf's bpf_helper_defs.h header. Return value is a number (error code), not a pointer. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200615214926.3638836-1-andriin@fb.com --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 19684813faae..974a71342aea 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3168,7 +3168,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 19684813faae..974a71342aea 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3168,7 +3168,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of -- cgit v1.2.3 From f517f7925b7b453cb83be06c268ba057b78e4792 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Mon, 15 Jun 2020 18:14:06 +0530 Subject: ndctl/papr_scm,uapi: Add support for PAPR nvdimm specific methods Introduce support for PAPR NVDIMM Specific Methods (PDSM) in papr_scm module and add the command family NVDIMM_FAMILY_PAPR to the white list of NVDIMM command sets. Also advertise support for ND_CMD_CALL for the nvdimm command mask and implement necessary scaffolding in the module to handle ND_CMD_CALL ioctl and PDSM requests that we receive. The layout of the PDSM request as we expect from libnvdimm/libndctl is described in newly introduced uapi header 'papr_pdsm.h' which defines a 'struct nd_pkg_pdsm' and a maximal union named 'nd_pdsm_payload'. These new structs together with 'struct nd_cmd_pkg' for a pdsm envelop thats sent by libndctl to libnvdimm and serviced by papr_scm in 'papr_scm_service_pdsm()'. The PDSM request is communicated by member 'struct nd_cmd_pkg.nd_command' together with other information on the pdsm payload (size-in, size-out). The patch also introduces 'struct pdsm_cmd_desc' instances of which are stored in an array __pdsm_cmd_descriptors[] indexed with PDSM cmd and corresponding access function pdsm_cmd_desc() is introduced. 'struct pdsm_cdm_desc' holds the service function for a given PDSM and corresponding payload in/out sizes. A new function papr_scm_service_pdsm() is introduced and is called from papr_scm_ndctl() in case of a PDSM request is received via ND_CMD_CALL command from libnvdimm. The function performs validation on the PDSM payload based on info present in corresponding PDSM descriptor and if valid calls the 'struct pdcm_cmd_desc.service' function to service the PDSM. Signed-off-by: Vaibhav Jain Cc: "Aneesh Kumar K . V" Cc: Dan Williams Cc: Michael Ellerman Cc: Ira Weiny Link: https://lore.kernel.org/r/20200615124407.32596-6-vaibhav@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/include/uapi/asm/papr_pdsm.h | 95 +++++++++++++++ arch/powerpc/platforms/pseries/papr_scm.c | 193 +++++++++++++++++++++++++++++- include/uapi/linux/ndctl.h | 1 + 3 files changed, 285 insertions(+), 4 deletions(-) create mode 100644 arch/powerpc/include/uapi/asm/papr_pdsm.h (limited to 'include/uapi') diff --git a/arch/powerpc/include/uapi/asm/papr_pdsm.h b/arch/powerpc/include/uapi/asm/papr_pdsm.h new file mode 100644 index 000000000000..28115152aa4e --- /dev/null +++ b/arch/powerpc/include/uapi/asm/papr_pdsm.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * PAPR nvDimm Specific Methods (PDSM) and structs for libndctl + * + * (C) Copyright IBM 2020 + * + * Author: Vaibhav Jain + */ + +#ifndef _UAPI_ASM_POWERPC_PAPR_PDSM_H_ +#define _UAPI_ASM_POWERPC_PAPR_PDSM_H_ + +#include +#include + +/* + * PDSM Envelope: + * + * The ioctl ND_CMD_CALL exchange data between user-space and kernel via + * envelope which consists of 2 headers sections and payload sections as + * illustrated below: + * +-----------------+---------------+---------------------------+ + * | 64-Bytes | 8-Bytes | Max 184-Bytes | + * +-----------------+---------------+---------------------------+ + * | ND-HEADER | PDSM-HEADER | PDSM-PAYLOAD | + * +-----------------+---------------+---------------------------+ + * | nd_family | | | + * | nd_size_out | cmd_status | | + * | nd_size_in | reserved | nd_pdsm_payload | + * | nd_command | payload --> | | + * | nd_fw_size | | | + * | nd_payload ---> | | | + * +---------------+-----------------+---------------------------+ + * + * ND Header: + * This is the generic libnvdimm header described as 'struct nd_cmd_pkg' + * which is interpreted by libnvdimm before passed on to papr_scm. Important + * member fields used are: + * 'nd_family' : (In) NVDIMM_FAMILY_PAPR_SCM + * 'nd_size_in' : (In) PDSM-HEADER + PDSM-IN-PAYLOAD (usually 0) + * 'nd_size_out' : (In) PDSM-HEADER + PDSM-RETURN-PAYLOAD + * 'nd_command' : (In) One of PAPR_PDSM_XXX + * 'nd_fw_size' : (Out) PDSM-HEADER + size of actual payload returned + * + * PDSM Header: + * This is papr-scm specific header that precedes the payload. This is defined + * as nd_cmd_pdsm_pkg. Following fields aare available in this header: + * + * 'cmd_status' : (Out) Errors if any encountered while servicing PDSM. + * 'reserved' : Not used, reserved for future and should be set to 0. + * 'payload' : A union of all the possible payload structs + * + * PDSM Payload: + * + * The layout of the PDSM Payload is defined by various structs shared between + * papr_scm and libndctl so that contents of payload can be interpreted. As such + * its defined as a union of all possible payload structs as + * 'union nd_pdsm_payload'. Based on the value of 'nd_cmd_pkg.nd_command' + * appropriate member of the union is accessed. + */ + +/* Max payload size that we can handle */ +#define ND_PDSM_PAYLOAD_MAX_SIZE 184 + +/* Max payload size that we can handle */ +#define ND_PDSM_HDR_SIZE \ + (sizeof(struct nd_pkg_pdsm) - ND_PDSM_PAYLOAD_MAX_SIZE) + +/* + * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel + * via 'nd_cmd_pkg.nd_command' member of the ioctl struct + */ +enum papr_pdsm { + PAPR_PDSM_MIN = 0x0, + PAPR_PDSM_MAX, +}; + +/* Maximal union that can hold all possible payload types */ +union nd_pdsm_payload { + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; +} __packed; + +/* + * PDSM-header + payload expected with ND_CMD_CALL ioctl from libnvdimm + * Valid member of union 'payload' is identified via 'nd_cmd_pkg.nd_command' + * that should always precede this struct when sent to papr_scm via CMD_CALL + * interface. + */ +struct nd_pkg_pdsm { + __s32 cmd_status; /* Out: Sub-cmd status returned back */ + __u16 reserved[2]; /* Ignored and to be set as '0' */ + union nd_pdsm_payload payload; +} __packed; + +#endif /* _UAPI_ASM_POWERPC_PAPR_PDSM_H_ */ diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 692ad3d79826..d3bbf9940ba4 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -15,13 +15,15 @@ #include #include +#include #define BIND_ANY_ADDR (~0ul) #define PAPR_SCM_DIMM_CMD_MASK \ ((1ul << ND_CMD_GET_CONFIG_SIZE) | \ (1ul << ND_CMD_GET_CONFIG_DATA) | \ - (1ul << ND_CMD_SET_CONFIG_DATA)) + (1ul << ND_CMD_SET_CONFIG_DATA) | \ + (1ul << ND_CMD_CALL)) /* DIMM health bitmap bitmap indicators */ /* SCM device is unable to persist memory contents */ @@ -349,17 +351,195 @@ static int papr_scm_meta_set(struct papr_scm_priv *p, return 0; } +/* + * Do a sanity checks on the inputs args to dimm-control function and return + * '0' if valid. Validation of PDSM payloads happens later in + * papr_scm_service_pdsm. + */ +static int is_cmd_valid(struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len) +{ + unsigned long cmd_mask = PAPR_SCM_DIMM_CMD_MASK; + struct nd_cmd_pkg *nd_cmd; + struct papr_scm_priv *p; + enum papr_pdsm pdsm; + + /* Only dimm-specific calls are supported atm */ + if (!nvdimm) + return -EINVAL; + + /* get the provider data from struct nvdimm */ + p = nvdimm_provider_data(nvdimm); + + if (!test_bit(cmd, &cmd_mask)) { + dev_dbg(&p->pdev->dev, "Unsupported cmd=%u\n", cmd); + return -EINVAL; + } + + /* For CMD_CALL verify pdsm request */ + if (cmd == ND_CMD_CALL) { + /* Verify the envelope and envelop size */ + if (!buf || + buf_len < (sizeof(struct nd_cmd_pkg) + ND_PDSM_HDR_SIZE)) { + dev_dbg(&p->pdev->dev, "Invalid pkg size=%u\n", + buf_len); + return -EINVAL; + } + + /* Verify that the nd_cmd_pkg.nd_family is correct */ + nd_cmd = (struct nd_cmd_pkg *)buf; + + if (nd_cmd->nd_family != NVDIMM_FAMILY_PAPR) { + dev_dbg(&p->pdev->dev, "Invalid pkg family=0x%llx\n", + nd_cmd->nd_family); + return -EINVAL; + } + + pdsm = (enum papr_pdsm)nd_cmd->nd_command; + + /* Verify if the pdsm command is valid */ + if (pdsm <= PAPR_PDSM_MIN || pdsm >= PAPR_PDSM_MAX) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Invalid PDSM\n", + pdsm); + return -EINVAL; + } + + /* Have enough space to hold returned 'nd_pkg_pdsm' header */ + if (nd_cmd->nd_size_out < ND_PDSM_HDR_SIZE) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Invalid payload\n", + pdsm); + return -EINVAL; + } + } + + /* Let the command be further processed */ + return 0; +} + +/* + * 'struct pdsm_cmd_desc' + * Identifies supported PDSMs' expected length of in/out payloads + * and pdsm service function. + * + * size_in : Size of input payload if any in the PDSM request. + * size_out : Size of output payload if any in the PDSM request. + * service : Service function for the PDSM request. Return semantics: + * rc < 0 : Error servicing PDSM and rc indicates the error. + * rc >=0 : Serviced successfully and 'rc' indicate number of + * bytes written to payload. + */ +struct pdsm_cmd_desc { + u32 size_in; + u32 size_out; + int (*service)(struct papr_scm_priv *dimm, + union nd_pdsm_payload *payload); +}; + +/* Holds all supported PDSMs' command descriptors */ +static const struct pdsm_cmd_desc __pdsm_cmd_descriptors[] = { + [PAPR_PDSM_MIN] = { + .size_in = 0, + .size_out = 0, + .service = NULL, + }, + /* New PDSM command descriptors to be added below */ + + /* Empty */ + [PAPR_PDSM_MAX] = { + .size_in = 0, + .size_out = 0, + .service = NULL, + }, +}; + +/* Given a valid pdsm cmd return its command descriptor else return NULL */ +static inline const struct pdsm_cmd_desc *pdsm_cmd_desc(enum papr_pdsm cmd) +{ + if (cmd >= 0 || cmd < ARRAY_SIZE(__pdsm_cmd_descriptors)) + return &__pdsm_cmd_descriptors[cmd]; + + return NULL; +} + +/* + * For a given pdsm request call an appropriate service function. + * Returns errors if any while handling the pdsm command package. + */ +static int papr_scm_service_pdsm(struct papr_scm_priv *p, + struct nd_cmd_pkg *pkg) +{ + /* Get the PDSM header and PDSM command */ + struct nd_pkg_pdsm *pdsm_pkg = (struct nd_pkg_pdsm *)pkg->nd_payload; + enum papr_pdsm pdsm = (enum papr_pdsm)pkg->nd_command; + const struct pdsm_cmd_desc *pdsc; + int rc; + + /* Fetch corresponding pdsm descriptor for validation and servicing */ + pdsc = pdsm_cmd_desc(pdsm); + + /* Validate pdsm descriptor */ + /* Ensure that reserved fields are 0 */ + if (pdsm_pkg->reserved[0] || pdsm_pkg->reserved[1]) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Invalid reserved field\n", + pdsm); + return -EINVAL; + } + + /* If pdsm expects some input, then ensure that the size_in matches */ + if (pdsc->size_in && + pkg->nd_size_in != (pdsc->size_in + ND_PDSM_HDR_SIZE)) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Mismatched size_in=%d\n", + pdsm, pkg->nd_size_in); + return -EINVAL; + } + + /* If pdsm wants to return data, then ensure that size_out matches */ + if (pdsc->size_out && + pkg->nd_size_out != (pdsc->size_out + ND_PDSM_HDR_SIZE)) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Mismatched size_out=%d\n", + pdsm, pkg->nd_size_out); + return -EINVAL; + } + + /* Service the pdsm */ + if (pdsc->service) { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Servicing..\n", pdsm); + + rc = pdsc->service(p, &pdsm_pkg->payload); + + if (rc < 0) { + /* error encountered while servicing pdsm */ + pdsm_pkg->cmd_status = rc; + pkg->nd_fw_size = ND_PDSM_HDR_SIZE; + } else { + /* pdsm serviced and 'rc' bytes written to payload */ + pdsm_pkg->cmd_status = 0; + pkg->nd_fw_size = ND_PDSM_HDR_SIZE + rc; + } + } else { + dev_dbg(&p->pdev->dev, "PDSM[0x%x]: Unsupported PDSM request\n", + pdsm); + pdsm_pkg->cmd_status = -ENOENT; + pkg->nd_fw_size = ND_PDSM_HDR_SIZE; + } + + return pdsm_pkg->cmd_status; +} + static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) { struct nd_cmd_get_config_size *get_size_hdr; + struct nd_cmd_pkg *call_pkg = NULL; struct papr_scm_priv *p; int rc; - /* Only dimm-specific calls are supported atm */ - if (!nvdimm) - return -EINVAL; + rc = is_cmd_valid(nvdimm, cmd, buf, buf_len); + if (rc) { + pr_debug("Invalid cmd=0x%x. Err=%d\n", cmd, rc); + return rc; + } /* Use a local variable in case cmd_rc pointer is NULL */ if (!cmd_rc) @@ -385,6 +565,11 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, *cmd_rc = papr_scm_meta_set(p, buf); break; + case ND_CMD_CALL: + call_pkg = (struct nd_cmd_pkg *)buf; + *cmd_rc = papr_scm_service_pdsm(p, call_pkg); + break; + default: dev_dbg(&p->pdev->dev, "Unknown command = %d\n", cmd); return -EINVAL; diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index de5d90212409..0e09dc5cec19 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -244,6 +244,7 @@ struct nd_cmd_pkg { #define NVDIMM_FAMILY_HPE2 2 #define NVDIMM_FAMILY_MSFT 3 #define NVDIMM_FAMILY_HYPERV 4 +#define NVDIMM_FAMILY_PAPR 5 #define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\ struct nd_cmd_pkg) -- cgit v1.2.3 From 9b4feb630e8e9801603f3cab3a36369e3c1cf88d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 24 May 2019 11:31:44 +0200 Subject: arch: wire-up close_range() This wires up the close_range() syscall into all arches at once. Suggested-by: Arnd Bergmann Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Acked-by: Arnd Bergmann Acked-by: Michael Ellerman (powerpc) Cc: Jann Horn Cc: David Howells Cc: Dmitry V. Levin Cc: Linus Torvalds Cc: Al Viro Cc: Florian Weimer Cc: linux-api@vger.kernel.org Cc: linux-alpha@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-ia64@vger.kernel.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-mips@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Cc: x86@kernel.org --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/uapi/asm-generic/unistd.h | 2 ++ 18 files changed, 20 insertions(+) (limited to 'include/uapi') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 5ddd128d4b7a..a28fb211881d 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -475,6 +475,7 @@ 543 common fspick sys_fspick 544 common pidfd_open sys_pidfd_open # 545 reserved for clone3 +546 common close_range sys_close_range 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd 549 common faccessat2 sys_faccessat2 diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index d5cae5ffede0..7e8ee4adf269 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -449,6 +449,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 6d95d0c8bf2f..c760b9e159f5 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -879,6 +879,8 @@ __SYSCALL(__NR_fspick, sys_fspick) __SYSCALL(__NR_pidfd_open, sys_pidfd_open) #define __NR_clone3 435 __SYSCALL(__NR_clone3, sys_clone3) +#define __NR_close_range 436 +__SYSCALL(__NR_close_range, sys_close_range) #define __NR_openat2 437 __SYSCALL(__NR_openat2, sys_openat2) #define __NR_pidfd_getfd 438 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 49e325b604b3..ced9c83e47c9 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -356,6 +356,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f71b1bbcc198..1a4822de7292 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -435,6 +435,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 __sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index edacc4561f2b..a3f4be8e7238 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -441,6 +441,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index f777141f5256..501bc09643bd 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -374,6 +374,7 @@ 433 n32 fspick sys_fspick 434 n32 pidfd_open sys_pidfd_open 435 n32 clone3 __sys_clone3 +436 n32 close_range sys_close_range 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd 439 n32 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index da8c76394e17..391acbf425a0 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -350,6 +350,7 @@ 433 n64 fspick sys_fspick 434 n64 pidfd_open sys_pidfd_open 435 n64 clone3 __sys_clone3 +436 n64 close_range sys_close_range 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd 439 n64 faccessat2 sys_faccessat2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 13280625d312..d28f12f641d3 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -423,6 +423,7 @@ 433 o32 fspick sys_fspick 434 o32 pidfd_open sys_pidfd_open 435 o32 clone3 __sys_clone3 +436 o32 close_range sys_close_range 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd 439 o32 faccessat2 sys_faccessat2 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 5a758fa6ec52..5d76b8f15197 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -433,6 +433,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3_wrapper +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f833a3190822..dd87a782d80e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -525,6 +525,7 @@ 435 32 clone3 ppc_clone3 sys_clone3 435 64 clone3 sys_clone3 435 spu clone3 sys_ni_syscall +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bfdcb7633957..effb5195608c 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -438,6 +438,7 @@ 433 common fspick sys_fspick sys_fspick 434 common pidfd_open sys_pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 sys_clone3 +436 common close_range sys_close_range sys_close_range 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 sys_faccessat2 diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index acc35daa1b79..96848db9659e 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -438,6 +438,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 8004a276cb74..d6447d08c9a1 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -481,6 +481,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d8f8a1a69ed1..3f0c6546b47c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -440,6 +440,7 @@ 433 i386 fspick sys_fspick 434 i386 pidfd_open sys_pidfd_open 435 i386 clone3 sys_clone3 +436 i386 close_range sys_close_range 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..f8637c2c863d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -357,6 +357,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 69d0d73876b3..d216ccba42f7 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -406,6 +406,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f4a01305d9a6..31001e3323bc 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -850,6 +850,8 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open) #define __NR_clone3 435 __SYSCALL(__NR_clone3, sys_clone3) #endif +#define __NR_close_range 436 +__SYSCALL(__NR_close_range, sys_close_range) #define __NR_openat2 437 __SYSCALL(__NR_openat2, sys_openat2) -- cgit v1.2.3 From 60997c3d45d9a67daf01c56d805ae4fec37e0bd8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 3 Jun 2020 21:48:55 +0200 Subject: close_range: add CLOSE_RANGE_UNSHARE One of the use-cases of close_range() is to drop file descriptors just before execve(). This would usually be expressed in the sequence: unshare(CLONE_FILES); close_range(3, ~0U); as pointed out by Linus it might be desirable to have this be a part of close_range() itself under a new flag CLOSE_RANGE_UNSHARE. This expands {dup,unshare)_fd() to take a max_fds argument that indicates the maximum number of file descriptors to copy from the old struct files. When the user requests that all file descriptors are supposed to be closed via close_range(min, max) then we can cap via unshare_fd(min) and hence don't need to do any of the heavy fput() work for everything above min. The patch makes it so that if CLOSE_RANGE_UNSHARE is requested and we do in fact currently share our file descriptor table we create a new private copy. We then close all fds in the requested range and finally after we're done we install the new fd table. Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- fs/file.c | 65 +++++++++++++++++++++++++++++++++++----- fs/open.c | 5 +--- include/linux/fdtable.h | 8 +++-- include/uapi/linux/close_range.h | 9 ++++++ kernel/fork.c | 11 +++---- 5 files changed, 79 insertions(+), 19 deletions(-) create mode 100644 include/uapi/linux/close_range.h (limited to 'include/uapi') diff --git a/fs/file.c b/fs/file.c index 1b8ff05e8311..340bc9569f9d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -19,6 +19,7 @@ #include #include #include +#include unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -265,12 +266,22 @@ static unsigned int count_open_files(struct fdtable *fdt) return i; } +static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) +{ + unsigned int count; + + count = count_open_files(fdt); + if (max_fds < NR_OPEN_DEFAULT) + max_fds = NR_OPEN_DEFAULT; + return min(count, max_fds); +} + /* * Allocate a new files structure and copy contents from the * passed in files structure. * errorp will be valid only when the returned files_struct is NULL. */ -struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; @@ -297,7 +308,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = count_open_files(old_fdt); + open_files = sane_fdtable_size(old_fdt, max_fds); /* * Check whether we need to allocate a larger fd array and fd set. @@ -328,7 +339,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = count_open_files(old_fdt); + open_files = sane_fdtable_size(old_fdt, max_fds); } copy_fd_bitmaps(new_fdt, old_fdt, open_files); @@ -665,32 +676,72 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. */ -int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd) +int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { unsigned int cur_max; + struct task_struct *me = current; + struct files_struct *cur_fds = me->files, *fds = NULL; + + if (flags & ~CLOSE_RANGE_UNSHARE) + return -EINVAL; if (fd > max_fd) return -EINVAL; rcu_read_lock(); - cur_max = files_fdtable(files)->max_fds; + cur_max = files_fdtable(cur_fds)->max_fds; rcu_read_unlock(); /* cap to last valid index into fdtable */ cur_max--; + if (flags & CLOSE_RANGE_UNSHARE) { + int ret; + unsigned int max_unshare_fds = NR_OPEN_MAX; + + /* + * If the requested range is greater than the current maximum, + * we're closing everything so only copy all file descriptors + * beneath the lowest file descriptor. + */ + if (max_fd >= cur_max) + max_unshare_fds = fd; + + ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); + if (ret) + return ret; + + /* + * We used to share our file descriptor table, and have now + * created a private one, make sure we're using it below. + */ + if (fds) + swap(cur_fds, fds); + } + max_fd = min(max_fd, cur_max); while (fd <= max_fd) { struct file *file; - file = pick_file(files, fd++); + file = pick_file(cur_fds, fd++); if (!file) continue; - filp_close(file, files); + filp_close(file, cur_fds); cond_resched(); } + if (fds) { + /* + * We're done closing the files we were supposed to. Time to install + * the new file descriptor table and drop the old one. + */ + task_lock(me); + me->files = cur_fds; + task_unlock(me); + put_files_struct(fds); + } + return 0; } diff --git a/fs/open.c b/fs/open.c index 073ea3c45347..5e62f18adc5b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1324,10 +1324,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { - if (flags) - return -EINVAL; - - return __close_range(current->files, fd, max_fd); + return __close_range(fd, max_fd, flags); } /* diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index fcd07181a365..a32bf47c593e 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -22,6 +22,7 @@ * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG +#define NR_OPEN_MAX ~0U struct fdtable { unsigned int max_fds; @@ -109,7 +110,7 @@ struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); -struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy; +struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), @@ -121,9 +122,10 @@ extern void __fd_install(struct files_struct *files, unsigned int fd, struct file *file); extern int __close_fd(struct files_struct *files, unsigned int fd); -extern int __close_range(struct files_struct *files, unsigned int fd, - unsigned int max_fd); +extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern int __close_fd_get_file(unsigned int fd, struct file **res); +extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp); extern struct kmem_cache *files_cachep; diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h new file mode 100644 index 000000000000..6928a9fdee3c --- /dev/null +++ b/include/uapi/linux/close_range.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_CLOSE_RANGE_H +#define _UAPI_LINUX_CLOSE_RANGE_H + +/* Unshare the file descriptor table before closing file descriptors. */ +#define CLOSE_RANGE_UNSHARE (1U << 1) + +#endif /* _UAPI_LINUX_CLOSE_RANGE_H */ + diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..8948121c8454 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1474,7 +1474,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) goto out; } - newf = dup_fd(oldf, &error); + newf = dup_fd(oldf, NR_OPEN_MAX, &error); if (!newf) goto out; @@ -2907,14 +2907,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, &error); + *new_fdp = dup_fd(fd, max_fds, &error); if (!*new_fdp) return error; } @@ -2974,7 +2975,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, &new_fd); + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3063,7 +3064,7 @@ int unshare_files(struct files_struct **displaced) struct files_struct *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, ©); + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); if (error || !copy) { *displaced = NULL; return error; -- cgit v1.2.3 From 03cc8353c2244c8b790c3c81a0f1532d34a9d738 Mon Sep 17 00:00:00 2001 From: Rob Gill Date: Mon, 1 Jun 2020 21:17:56 +0000 Subject: USB: core: additional Device Classes to debug/usb/devices Several newer USB Device classes are not presently reported individually at /sys/kernel/debug/usb/devices, (reported as "unk."). This patch adds the following classes: 0fh (Personal Healthcare devices), 10h (USB Type-C combined Audio/Video devices) 11h (USB billboard), 12h (USB Type-C Bridge). As defined at [https://www.usb.org/defined-class-codes] Corresponding classes defined in include/linux/usb/ch9.h. Signed-off-by: Rob Gill Link: https://lore.kernel.org/r/20200601211749.6878-1-rrobgill@protonmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devices.c | 4 ++++ include/uapi/linux/usb/ch9.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c index 94b6fa6e585e..696b2b692b83 100644 --- a/drivers/usb/core/devices.c +++ b/drivers/usb/core/devices.c @@ -133,6 +133,10 @@ static const struct class_info clas_info[] = { {USB_CLASS_CSCID, "scard"}, {USB_CLASS_CONTENT_SEC, "c-sec"}, {USB_CLASS_VIDEO, "video"}, + {USB_CLASS_PERSONAL_HEALTHCARE, "perhc"}, + {USB_CLASS_AUDIO_VIDEO, "av"}, + {USB_CLASS_BILLBOARD, "blbrd"}, + {USB_CLASS_USB_TYPE_C_BRIDGE, "bridg"}, {USB_CLASS_WIRELESS_CONTROLLER, "wlcon"}, {USB_CLASS_MISC, "misc"}, {USB_CLASS_APP_SPEC, "app."}, diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 2b623f36af6b..456ab0c2b586 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -326,6 +326,10 @@ struct usb_device_descriptor { #define USB_CLASS_CONTENT_SEC 0x0d /* content security */ #define USB_CLASS_VIDEO 0x0e #define USB_CLASS_WIRELESS_CONTROLLER 0xe0 +#define USB_CLASS_PERSONAL_HEALTHCARE 0x0f +#define USB_CLASS_AUDIO_VIDEO 0x10 +#define USB_CLASS_BILLBOARD 0x11 +#define USB_CLASS_USB_TYPE_C_BRIDGE 0x12 #define USB_CLASS_MISC 0xef #define USB_CLASS_APP_SPEC 0xfe #define USB_CLASS_VENDOR_SPEC 0xff -- cgit v1.2.3 From 81c7462883b0cc0a4eeef0687f80ad5b5baee5f6 Mon Sep 17 00:00:00 2001 From: Macpaul Lin Date: Thu, 18 Jun 2020 17:13:38 +0800 Subject: USB: replace hardcode maximum usb string length by definition Replace hardcoded maximum USB string length (126 bytes) by definition "USB_MAX_STRING_LEN". Signed-off-by: Macpaul Lin Acked-by: Alan Stern Link: https://lore.kernel.org/r/1592471618-29428-1-git-send-email-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/composite.c | 4 ++-- drivers/usb/gadget/configfs.c | 2 +- drivers/usb/gadget/usbstring.c | 4 ++-- include/uapi/linux/usb/ch9.h | 3 +++ 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 5c1eb96a5c57..8fbf73467fef 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1085,7 +1085,7 @@ static void collect_langs(struct usb_gadget_strings **sp, __le16 *buf) while (*sp) { s = *sp; language = cpu_to_le16(s->language); - for (tmp = buf; *tmp && tmp < &buf[126]; tmp++) { + for (tmp = buf; *tmp && tmp < &buf[USB_MAX_STRING_LEN]; tmp++) { if (*tmp == language) goto repeat; } @@ -1160,7 +1160,7 @@ static int get_string(struct usb_composite_dev *cdev, collect_langs(sp, s->wData); } - for (len = 0; len <= 126 && s->wData[len]; len++) + for (len = 0; len <= USB_MAX_STRING_LEN && s->wData[len]; len++) continue; if (!len) return -EINVAL; diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 9dc06a4e1b30..56051bb97349 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -103,7 +103,7 @@ static int usb_string_copy(const char *s, char **s_copy) char *str; char *copy = *s_copy; ret = strlen(s); - if (ret > 126) + if (ret > USB_MAX_STRING_LEN) return -EOVERFLOW; str = kstrdup(s, GFP_KERNEL); diff --git a/drivers/usb/gadget/usbstring.c b/drivers/usb/gadget/usbstring.c index 58a4d3325090..30f889ad3ad2 100644 --- a/drivers/usb/gadget/usbstring.c +++ b/drivers/usb/gadget/usbstring.c @@ -55,9 +55,9 @@ usb_gadget_get_string (const struct usb_gadget_strings *table, int id, u8 *buf) return -EINVAL; /* string descriptors have length, tag, then UTF16-LE text */ - len = min ((size_t) 126, strlen (s->s)); + len = min((size_t)USB_MAX_STRING_LEN, strlen(s->s)); len = utf8s_to_utf16s(s->s, len, UTF16_LITTLE_ENDIAN, - (wchar_t *) &buf[2], 126); + (wchar_t *) &buf[2], USB_MAX_STRING_LEN); if (len < 0) return -EINVAL; buf [0] = (len + 1) * 2; diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 456ab0c2b586..b1ed2ccfe9cf 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -368,6 +368,9 @@ struct usb_config_descriptor { /*-------------------------------------------------------------------------*/ +/* USB String descriptors can contain at most 126 characters. */ +#define USB_MAX_STRING_LEN 126 + /* USB_DT_STRING: String descriptor */ struct usb_string_descriptor { __u8 bLength; -- cgit v1.2.3 From f751820bc319a30d31a80fe511d88642aaefcdee Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 18 Jun 2020 13:07:13 -0600 Subject: vfio/type1: Fix migration info capability ID ID 1 is already used by the IOVA range capability, use ID 2. Reported-by: Liu Yi L Fixes: ad721705d09c ("vfio iommu: Add migration capability to report supported features") Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index eca6692667a3..920470502329 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1030,7 +1030,7 @@ struct vfio_iommu_type1_info_cap_iova_range { * size in bytes that can be used by user applications when getting the dirty * bitmap. */ -#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 1 +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 2 struct vfio_iommu_type1_info_cap_migration { struct vfio_info_cap_header header; -- cgit v1.2.3 From 94b292b277343190175d39172c903c0c5fb814f1 Mon Sep 17 00:00:00 2001 From: Ben Davis Date: Mon, 1 Jun 2020 17:28:17 +0100 Subject: drm: drm_fourcc: add NV15, Q410, Q401 YUV formats DRM_FORMAT_NV15 is a 2 plane format suitable for linear and 16x16 block-linear memory layouts (DRM_FORMAT_MOD_SAMSUNG_16_16_TILE). The format is similar to P010 with 4:2:0 sub-sampling but has no padding between components. Instead, luminance and chrominance samples are grouped into 4s so that each group is packed into an integer number of bytes: YYYY = UVUV = 4 * 10 bits = 40 bits = 5 bytes The '15' suffix refers to the optimum effective bits per pixel which is achieved when the total number of luminance samples is a multiple of 8. Q410 and Q401 are both 3 plane non-subsampled formats with 16 bits per component, but only 10 bits are used and 6 are padded. 'Q' is chosen as the first letter to denote 3 plane YUV444, (and is the next letter along from P which is usually 2 plane). V2: Updated block_w of NV15 to {4, 2, 0} V3: Updated commit message to include specific modifier name NV15: Tested-by: Jonas Karlman Reviewed-by: Brian Starkey Signed-off-by: Ben Davis Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20200601162817.18230-1-ben.davis@arm.com --- drivers/gpu/drm/drm_fourcc.c | 12 ++++++++++++ include/uapi/drm/drm_fourcc.h | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c index b234bfaeda06..722c7ebe4e88 100644 --- a/drivers/gpu/drm/drm_fourcc.c +++ b/drivers/gpu/drm/drm_fourcc.c @@ -274,6 +274,18 @@ const struct drm_format_info *__drm_format_info(u32 format) { .format = DRM_FORMAT_YUV420_10BIT, .depth = 0, .num_planes = 1, .cpp = { 0, 0, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, + { .format = DRM_FORMAT_NV15, .depth = 0, + .num_planes = 2, .char_per_block = { 5, 5, 0 }, + .block_w = { 4, 2, 0 }, .block_h = { 1, 1, 0 }, .hsub = 2, + .vsub = 2, .is_yuv = true }, + { .format = DRM_FORMAT_Q410, .depth = 0, + .num_planes = 3, .char_per_block = { 2, 2, 2 }, + .block_w = { 1, 1, 1 }, .block_h = { 1, 1, 1 }, .hsub = 0, + .vsub = 0, .is_yuv = true }, + { .format = DRM_FORMAT_Q401, .depth = 0, + .num_planes = 3, .char_per_block = { 2, 2, 2 }, + .block_w = { 1, 1, 1 }, .block_h = { 1, 1, 1 }, .hsub = 0, + .vsub = 0, .is_yuv = true }, }; unsigned int i; diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 9e488d10f8b4..3c4055e48371 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -236,6 +236,12 @@ extern "C" { #define DRM_FORMAT_NV61 fourcc_code('N', 'V', '6', '1') /* 2x1 subsampled Cb:Cr plane */ #define DRM_FORMAT_NV24 fourcc_code('N', 'V', '2', '4') /* non-subsampled Cr:Cb plane */ #define DRM_FORMAT_NV42 fourcc_code('N', 'V', '4', '2') /* non-subsampled Cb:Cr plane */ +/* + * 2 plane YCbCr + * index 0 = Y plane, [39:0] Y3:Y2:Y1:Y0 little endian + * index 1 = Cr:Cb plane, [39:0] Cr1:Cb1:Cr0:Cb0 little endian + */ +#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') /* 2x2 subsampled Cr:Cb plane */ /* * 2 plane YCbCr MSB aligned @@ -265,6 +271,22 @@ extern "C" { */ #define DRM_FORMAT_P016 fourcc_code('P', '0', '1', '6') /* 2x2 subsampled Cr:Cb plane 16 bits per channel */ +/* 3 plane non-subsampled (444) YCbCr + * 16 bits per component, but only 10 bits are used and 6 bits are padded + * index 0: Y plane, [15:0] Y:x [10:6] little endian + * index 1: Cb plane, [15:0] Cb:x [10:6] little endian + * index 2: Cr plane, [15:0] Cr:x [10:6] little endian + */ +#define DRM_FORMAT_Q410 fourcc_code('Q', '4', '1', '0') + +/* 3 plane non-subsampled (444) YCrCb + * 16 bits per component, but only 10 bits are used and 6 bits are padded + * index 0: Y plane, [15:0] Y:x [10:6] little endian + * index 1: Cr plane, [15:0] Cr:x [10:6] little endian + * index 2: Cb plane, [15:0] Cb:x [10:6] little endian + */ +#define DRM_FORMAT_Q401 fourcc_code('Q', '4', '0', '1') + /* * 3 plane YCbCr * index 0: Y plane, [7:0] Y -- cgit v1.2.3 From 79ce058032c391b12af928b1e30abf92482a270f Mon Sep 17 00:00:00 2001 From: Ben Davis Date: Thu, 30 Apr 2020 09:32:20 +0100 Subject: drm: drm_fourcc: Add uncompressed AFBC modifier AFBC has a mode that guarantees use of AFBC with an uncompressed payloads, we add a new modifier to support this mode. V2: updated modifier comment Signed-off-by: Ben Davis Acked-by: Liviu Dudau Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20200430083220.17347-1-ben.davis@arm.com --- include/uapi/drm/drm_fourcc.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 3c4055e48371..299f9ac4840b 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -808,6 +808,18 @@ extern "C" { */ #define AFBC_FORMAT_MOD_BCH (1ULL << 11) +/* AFBC uncompressed storage mode + * + * Indicates that the buffer is using AFBC uncompressed storage mode. + * In this mode all superblock payloads in the buffer use the uncompressed + * storage mode, which is usually only used for data which cannot be compressed. + * The buffer layout is the same as for AFBC buffers without USM set, this only + * affects the storage mode of the individual superblocks. Note that even a + * buffer without USM set may use uncompressed storage mode for some or all + * superblocks, USM just guarantees it for all. + */ +#define AFBC_FORMAT_MOD_USM (1ULL << 12) + /* * Arm 16x16 Block U-Interleaved modifier * -- cgit v1.2.3 From 5769a351b89cd4d82016f18fa5f6c4077403564d Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 17 Jun 2020 17:53:55 +0800 Subject: io_uring: change the poll type to be 32-bits poll events should be 32-bits to cover EPOLLEXCLUSIVE. Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed. We call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should check the feature bit first. Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- include/uapi/linux/io_uring.h | 4 +++- tools/io_uring/liburing.h | 6 +++++- 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index a78201b96179..0eb063daa9b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4589,7 +4589,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u16 events; + u32 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4598,7 +4598,10 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (!poll->file) return -EBADF; - events = READ_ONCE(sqe->poll_events); + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; io_get_req_task(req); @@ -7928,7 +7931,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | + IORING_FEAT_POLL_32BITS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -8217,7 +8221,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 92c22699a5a7..8d033961cb78 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -31,7 +31,8 @@ struct io_uring_sqe { union { __kernel_rwf_t rw_flags; __u32 fsync_flags; - __u16 poll_events; + __u16 poll_events; /* compatibility */ + __u32 poll32_events; /* word-reversed for BE */ __u32 sync_range_flags; __u32 msg_flags; __u32 timeout_flags; @@ -248,6 +249,7 @@ struct io_uring_params { #define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_CUR_PERSONALITY (1U << 4) #define IORING_FEAT_FAST_POLL (1U << 5) +#define IORING_FEAT_POLL_32BITS (1U << 6) /* * io_uring_register(2) opcodes and arguments diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h index 5f305c86b892..28a837b6069d 100644 --- a/tools/io_uring/liburing.h +++ b/tools/io_uring/liburing.h @@ -10,6 +10,7 @@ extern "C" { #include #include "../../include/uapi/linux/io_uring.h" #include +#include #include "barrier.h" /* @@ -145,11 +146,14 @@ static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, - short poll_mask) + unsigned poll_mask) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = IORING_OP_POLL_ADD; sqe->fd = fd; +#if __BYTE_ORDER == __BIG_ENDIAN + poll_mask = __swahw32(poll_mask); +#endif sqe->poll_events = poll_mask; } -- cgit v1.2.3 From 2a916ecc405686c1d86f632281bc06aa75ebae4e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 19 Jun 2020 03:32:48 +0000 Subject: net/devlink: Support querying hardware address of port function PCI PF and VF devlink port can manage the function represented by a devlink port. Enable users to query port function's hardware address. Example of a PCI VF port which supports a port function: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:11:22:33:44:66 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "enp6s0pf0vf1", "flavour": "pcivf", "pfnum": 0, "vfnum": 1, "function": { "hw_addr": "00:11:22:33:44:66" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 12 ++++++++++++ include/uapi/linux/devlink.h | 10 ++++++++++ net/core/devlink.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) (limited to 'include/uapi') diff --git a/include/net/devlink.h b/include/net/devlink.h index 1df6dfec26c2..56fc9cdb189d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1107,6 +1107,18 @@ struct devlink_ops { int (*trap_policer_counter_get)(struct devlink *devlink, const struct devlink_trap_policer *policer, u64 *p_drops); + /** + * @port_function_hw_addr_get: Port function's hardware address get function. + * + * Should be used by device drivers to report the hardware address of a function managed + * by the devlink port. Driver should return -EOPNOTSUPP if it doesn't support port + * function handling for a particular port. + * + * Note: @extack can be NULL when port notifier queries the port function. + */ + int (*port_function_hw_addr_get)(struct devlink *devlink, struct devlink_port *port, + u8 *hw_addr, int *hw_addr_len, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 08563e6a424d..07d0af8f5923 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -451,6 +451,8 @@ enum devlink_attr { DEVLINK_ATTR_TRAP_POLICER_RATE, /* u64 */ DEVLINK_ATTR_TRAP_POLICER_BURST, /* u64 */ + DEVLINK_ATTR_PORT_FUNCTION, /* nested */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, @@ -497,4 +499,12 @@ enum devlink_resource_unit { DEVLINK_RESOURCE_UNIT_ENTRY, }; +enum devlink_port_function_attr { + DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, + DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ + + __DEVLINK_PORT_FUNCTION_ATTR_MAX, + DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 05197631d52a..b6848b607e9c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -563,6 +563,49 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; } +static int +devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = port->devlink; + const struct devlink_ops *ops; + struct nlattr *function_attr; + bool empty_nest = true; + int err = 0; + + function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); + if (!function_attr) + return -EMSGSIZE; + + ops = devlink->ops; + if (ops->port_function_hw_addr_get) { + int uninitialized_var(hw_addr_len); + u8 hw_addr[MAX_ADDR_LEN]; + + err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + if (err == -EOPNOTSUPP) { + /* Port function attributes are optional for a port. If port doesn't + * support function attribute, returning -EOPNOTSUPP is not an error. + */ + err = 0; + goto out; + } else if (err) { + goto out; + } + err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); + if (err) + goto out; + empty_nest = false; + } + +out: + if (err || empty_nest) + nla_nest_cancel(msg, function_attr); + else + nla_nest_end(msg, function_attr); + return err; +} + static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, @@ -608,6 +651,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; + if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3 From b5872cd0e823e4cb50b3a75cd9522167eeb676a2 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Sat, 20 Jun 2020 22:01:56 +0530 Subject: devlink: Add support for board.serial_number to info_get cb. Board serial number is a serial number, often available in PCI *Vital Product Data*. Also, update devlink-info.rst documentation file. Cc: Jiri Pirko Cc: Jakub Kicinski Signed-off-by: Vasundhara Volam Reviewed-by: Michael Chan Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-info.rst | 12 +++++------- include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 8 ++++++++ 4 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/devlink/devlink-info.rst b/Documentation/networking/devlink/devlink-info.rst index 3fe11401b838..7572bf6de5c1 100644 --- a/Documentation/networking/devlink/devlink-info.rst +++ b/Documentation/networking/devlink/devlink-info.rst @@ -44,9 +44,11 @@ versions is generally discouraged - here, and via any other Linux API. reported for two ports of the same device or on two hosts of a multi-host device should be identical. - .. note:: ``devlink-info`` API should be extended with a new field - if devices want to report board/product serial number (often - reported in PCI *Vital Product Data* capability). + * - ``board.serial_number`` + - Board serial number of the device. + + This is usually the serial number of the board, often available in + PCI *Vital Product Data*. * - ``fixed`` - Group for hardware identifiers, and versions of components @@ -201,10 +203,6 @@ Future work The following extensions could be useful: - - product serial number - NIC boards often get labeled with a board serial - number rather than ASIC serial number; it'd be useful to add board serial - numbers to the API if they can be retrieved from the device; - - on-disk firmware file names - drivers list the file names of firmware they may need to load onto devices via the ``MODULE_FIRMWARE()`` macro. These, however, are per module, rather than per device. It'd be useful to list diff --git a/include/net/devlink.h b/include/net/devlink.h index 7007f93585a5..428f55f8197c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1284,6 +1284,8 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn); int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name); +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn); int devlink_info_version_fixed_put(struct devlink_info_req *req, const char *version_name, const char *version_value); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 07d0af8f5923..87c83a82991b 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -453,6 +453,8 @@ enum devlink_attr { DEVLINK_ATTR_PORT_FUNCTION, /* nested */ + DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index baa45eca6b5a..455998a57671 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4502,6 +4502,14 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, + bsn); +} +EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); + static int devlink_info_version_put(struct devlink_info_req *req, int attr, const char *version_name, const char *version_value) -- cgit v1.2.3 From 23a60f834406c8e3805328b630d09d5546b460c1 Mon Sep 17 00:00:00 2001 From: Collin Walling Date: Mon, 22 Jun 2020 11:46:36 -0400 Subject: s390/kvm: diagnose 0x318 sync and reset DIAGNOSE 0x318 (diag318) sets information regarding the environment the VM is running in (Linux, z/VM, etc) and is observed via firmware/service events. This is a privileged s390x instruction that must be intercepted by SIE. Userspace handles the instruction as well as migration. Data is communicated via VCPU register synchronization. The Control Program Name Code (CPNC) is stored in the SIE block. The CPNC along with the Control Program Version Code (CPVC) are stored in the kvm_vcpu_arch struct. This data is reset on load normal and clear resets. Signed-off-by: Collin Walling Reviewed-by: Janosch Frank Acked-by: Cornelia Huck Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20200622154636.5499-3-walling@linux.ibm.com [borntraeger@de.ibm.com: fix sync_reg position] Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 4 +++- arch/s390/include/uapi/asm/kvm.h | 7 +++++-- arch/s390/kvm/kvm-s390.c | 11 ++++++++++- arch/s390/kvm/vsie.c | 1 + include/uapi/linux/kvm.h | 1 + 5 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index cee3cb6455a2..371ec6beb618 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -260,7 +260,8 @@ struct kvm_s390_sie_block { __u32 scaol; /* 0x0064 */ __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ - __u8 reserved6a[2]; /* 0x006a */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ __u32 todpr; /* 0x006c */ #define GISA_FORMAT1 0x00000001 __u32 gd; /* 0x0070 */ @@ -745,6 +746,7 @@ struct kvm_vcpu_arch { bool gs_enabled; bool skey_enabled; struct kvm_s390_pv_vcpu pv; + union diag318_info diag318_info; }; struct kvm_vm_stat { diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 436ec7636927..7a6b14874d65 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -231,11 +231,13 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) #define KVM_SYNC_ETOKEN (1UL << 11) +#define KVM_SYNC_DIAG318 (1UL << 12) #define KVM_SYNC_S390_VALID_FIELDS \ (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ - KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ + KVM_SYNC_DIAG318) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 @@ -264,7 +266,8 @@ struct kvm_sync_regs { __u8 reserved2 : 7; __u8 padding1[51]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ - __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + __u64 diag318; /* diagnose 0x318 info */ + __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ union { __u8 sdnx[SDNXL]; /* state description annex */ struct { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d47c19718615..08e6cf6cb454 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -545,6 +545,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_AIS_MIGRATION: case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_S390_DIAG318: r = 1; break; case KVM_CAP_S390_HPAGE_1M: @@ -3267,7 +3268,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) KVM_SYNC_ACRS | KVM_SYNC_CRS | KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; + KVM_SYNC_PFAULT | + KVM_SYNC_DIAG318; kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; @@ -3562,6 +3564,7 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->pp = 0; vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->todpr = 0; + vcpu->arch.sie_block->cpnc = 0; } } @@ -3579,6 +3582,7 @@ static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) regs->etoken = 0; regs->etoken_extension = 0; + regs->diag318 = 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -4196,6 +4200,10 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { + vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; + vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + } /* * If userspace sets the riccb (e.g. after migration) to a valid state, * we should enable RI here instead of doing the lazy enablement. @@ -4297,6 +4305,7 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; + kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; if (MACHINE_HAS_GS) { __ctl_set_bit(2, 4); if (vcpu->arch.gs_enabled) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 9e9056cebfcf..4f3cbf6003a9 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -548,6 +548,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; scb_s->hpid = HPID_VSIE; + scb_s->cpnc = scb_o->cpnc; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4fdf30316582..35cdb4307904 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 +#define KVM_CAP_S390_DIAG318 184 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From ac53503ee38a1ffbc47c7cca6cbfc48ba9c65c5e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 14 May 2020 18:01:43 +0200 Subject: media: videobuf2: add V4L2_FLAG_MEMORY_NON_CONSISTENT flag By setting or clearing V4L2_FLAG_MEMORY_NON_CONSISTENT flag user-space should be able to set or clear queue's NON_CONSISTENT ->dma_attrs. Queue's ->dma_attrs are passed to the underlying allocator in __vb2_buf_mem_alloc(), so thus user-space is able to request vb2 buffer's memory to be either consistent (coherent) or non-consistent. The patch set also adds a corresponding capability flag: fill_buf_caps() reports V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS when queue supports user-space cache management hints. Note, however, that MMAP_CACHE_HINTS capability only valid when the queue is used for memory MMAP-ed streaming I/O. Signed-off-by: Sergey Senozhatsky Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/buffer.rst | 40 ++++++++++++++++++++-- .../userspace-api/media/v4l/vidioc-reqbufs.rst | 10 ++++++ drivers/media/common/videobuf2/videobuf2-v4l2.c | 2 ++ include/uapi/linux/videodev2.h | 3 ++ 4 files changed, 53 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/buffer.rst b/Documentation/userspace-api/media/v4l/buffer.rst index 951ae1ed485f..5088393b5a5c 100644 --- a/Documentation/userspace-api/media/v4l/buffer.rst +++ b/Documentation/userspace-api/media/v4l/buffer.rst @@ -577,7 +577,10 @@ Buffer Flags applications shall use this flag if the data captured in the buffer is not going to be touched by the CPU, instead the buffer will, probably, be passed on to a DMA-capable hardware unit for - further processing or output. + further processing or output. This flag is ignored unless the + queue is used for :ref:`memory mapping ` streaming I/O and + reports :ref:`V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS + ` capability. * .. _`V4L2-BUF-FLAG-NO-CACHE-CLEAN`: - ``V4L2_BUF_FLAG_NO_CACHE_CLEAN`` @@ -585,7 +588,10 @@ Buffer Flags - Caches do not have to be cleaned for this buffer. Typically applications shall use this flag for output buffers if the data in this buffer has not been created by the CPU but by some - DMA-capable unit, in which case caches have not been used. + DMA-capable unit, in which case caches have not been used. This flag + is ignored unless the queue is used for :ref:`memory mapping ` + streaming I/O and reports :ref:`V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS + ` capability. * .. _`V4L2-BUF-FLAG-M2M-HOLD-CAPTURE-BUF`: - ``V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF`` @@ -681,6 +687,36 @@ Buffer Flags \normalsize +.. _memory-flags: + +Memory Consistency Flags +======================== + +.. tabularcolumns:: |p{7.0cm}|p{2.2cm}|p{8.3cm}| + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 3 1 4 + + * .. _`V4L2-FLAG-MEMORY-NON-CONSISTENT`: + + - ``V4L2_FLAG_MEMORY_NON_CONSISTENT`` + - 0x00000001 + - A buffer is allocated either in consistent (it will be automatically + coherent between the CPU and the bus) or non-consistent memory. The + latter can provide performance gains, for instance the CPU cache + sync/flush operations can be avoided if the buffer is accessed by the + corresponding device only and the CPU does not read/write to/from that + buffer. However, this requires extra care from the driver -- it must + guarantee memory consistency by issuing a cache flush/sync when + consistency is needed. If this flag is set V4L2 will attempt to + allocate the buffer in non-consistent memory. The flag takes effect + only if the buffer is used for :ref:`memory mapping ` I/O and the + queue reports the :ref:`V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS + ` capability. .. c:type:: v4l2_memory diff --git a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst index b6d52083707b..96a59793d857 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst @@ -126,6 +126,7 @@ aborting or finishing any DMA in progress, an implicit .. _V4L2-BUF-CAP-SUPPORTS-REQUESTS: .. _V4L2-BUF-CAP-SUPPORTS-ORPHANED-BUFS: .. _V4L2-BUF-CAP-SUPPORTS-M2M-HOLD-CAPTURE-BUF: +.. _V4L2-BUF-CAP-SUPPORTS-MMAP-CACHE-HINTS: .. cssclass:: longtable @@ -156,6 +157,15 @@ aborting or finishing any DMA in progress, an implicit - Only valid for stateless decoders. If set, then userspace can set the ``V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF`` flag to hold off on returning the capture buffer until the OUTPUT timestamp changes. + * - ``V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS`` + - 0x00000040 + - This capability is set by the driver to indicate that the queue supports + cache and memory management hints. However, it's only valid when the + queue is used for :ref:`memory mapping ` streaming I/O. See + :ref:`V4L2_FLAG_MEMORY_NON_CONSISTENT `, + :ref:`V4L2_BUF_FLAG_NO_CACHE_INVALIDATE ` and + :ref:`V4L2_BUF_FLAG_NO_CACHE_CLEAN `. + Return Value ============ diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index f13851212cc8..e4b4354b42b8 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -710,6 +710,8 @@ static void fill_buf_caps(struct vb2_queue *q, u32 *caps) *caps |= V4L2_BUF_CAP_SUPPORTS_DMABUF; if (q->subsystem_flags & VB2_V4L2_FL_SUPPORTS_M2M_HOLD_CAPTURE_BUF) *caps |= V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF; + if (q->allow_cache_hints && q->io_modes & VB2_MMAP) + *caps |= V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS; #ifdef CONFIG_MEDIA_CONTROLLER_REQUEST_API if (q->supports_requests) *caps |= V4L2_BUF_CAP_SUPPORTS_REQUESTS; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c3a1cf1c507f..34ba1017b89b 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -189,6 +189,8 @@ enum v4l2_memory { V4L2_MEMORY_DMABUF = 4, }; +#define V4L2_FLAG_MEMORY_NON_CONSISTENT (1 << 0) + /* see also http://vektor.theorem.ca/graphics/ycbcr/ */ enum v4l2_colorspace { /* @@ -954,6 +956,7 @@ struct v4l2_requestbuffers { #define V4L2_BUF_CAP_SUPPORTS_REQUESTS (1 << 3) #define V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS (1 << 4) #define V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF (1 << 5) +#define V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS (1 << 6) /** * struct v4l2_plane - plane info for multi-planar buffers -- cgit v1.2.3 From 1e0b2318fa75d186ee0d2be31843ce867385fcc4 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 14 May 2020 18:01:45 +0200 Subject: media: videobuf2: handle V4L2_FLAG_MEMORY_NON_CONSISTENT flag This patch lets user-space to request a non-consistent memory allocation during CREATE_BUFS and REQBUFS ioctl calls. = CREATE_BUFS struct v4l2_create_buffers has seven 4-byte reserved areas, so reserved[0] is renamed to ->flags. The struct, thus, now has six reserved 4-byte regions. = CREATE_BUFS32 struct v4l2_create_buffers32 has seven 4-byte reserved areas, so reserved[0] is renamed to ->flags. The struct, thus, now has six reserved 4-byte regions. = REQBUFS We use one bit of a ->reserved[1] member of struct v4l2_requestbuffers, which is now renamed to ->flags. Unlike v4l2_create_buffers, struct v4l2_requestbuffers does not have enough reserved room. Therefore for backward compatibility ->reserved and ->flags were put into anonymous union. Signed-off-by: Sergey Senozhatsky Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/vidioc-create-bufs.rst | 7 ++++++- .../userspace-api/media/v4l/vidioc-reqbufs.rst | 11 ++++++++-- drivers/media/common/videobuf2/videobuf2-core.c | 6 ++++++ drivers/media/common/videobuf2/videobuf2-v4l2.c | 24 ++++++++++++++++++---- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 10 +++++++-- drivers/media/v4l2-core/v4l2-ioctl.c | 5 +---- include/uapi/linux/videodev2.h | 11 ++++++++-- 7 files changed, 59 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst b/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst index e1afc5b504c2..f2a702870fad 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst @@ -121,7 +121,12 @@ than the number requested. other changes, then set ``count`` to 0, ``memory`` to ``V4L2_MEMORY_MMAP`` and ``format.type`` to the buffer type. * - __u32 - - ``reserved``\ [7] + - ``flags`` + - Specifies additional buffer management attributes. + See :ref:`memory-flags`. + + * - __u32 + - ``reserved``\ [6] - A place holder for future extensions. Drivers and applications must set the array to zero. diff --git a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst index 96a59793d857..75d894d9c36c 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst @@ -112,10 +112,17 @@ aborting or finishing any DMA in progress, an implicit ``V4L2_MEMORY_MMAP`` and ``type`` set to the buffer type. This will free any previously allocated buffers, so this is typically something that will be done at the start of the application. + * - union { + - (anonymous) + * - __u32 + - ``flags`` + - Specifies additional buffer management attributes. + See :ref:`memory-flags`. * - __u32 - ``reserved``\ [1] - - A place holder for future extensions. Drivers and applications - must set the array to zero. + - Kept for backwards compatibility. Use ``flags`` instead. + * - } + - .. tabularcolumns:: |p{6.1cm}|p{2.2cm}|p{8.7cm}| diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index 0fdcf90330df..626c4db5134c 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -694,6 +694,9 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, unsigned int i; int ret; + if (flags & V4L2_FLAG_MEMORY_NON_CONSISTENT) + consistent_mem = false; + if (q->streaming) { dprintk(1, "streaming active\n"); return -EBUSY; @@ -837,6 +840,9 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, bool consistent_mem = true; int ret; + if (flags & V4L2_FLAG_MEMORY_NON_CONSISTENT) + consistent_mem = false; + if (q->num_buffers == VB2_MAX_FRAME) { dprintk(1, "maximum number of buffers already allocated\n"); return -ENOBUFS; diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index 26a3ec333bb7..559a229cac41 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -718,12 +718,22 @@ static void fill_buf_caps(struct vb2_queue *q, u32 *caps) #endif } +static void clear_consistency_attr(struct vb2_queue *q, + int memory, + unsigned int *flags) +{ + if (!q->allow_cache_hints || memory != V4L2_MEMORY_MMAP) + *flags &= ~V4L2_FLAG_MEMORY_NON_CONSISTENT; +} + int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req) { int ret = vb2_verify_memory_type(q, req->memory, req->type); fill_buf_caps(q, &req->capabilities); - return ret ? ret : vb2_core_reqbufs(q, req->memory, 0, &req->count); + clear_consistency_attr(q, req->memory, &req->flags); + return ret ? ret : vb2_core_reqbufs(q, req->memory, + req->flags, &req->count); } EXPORT_SYMBOL_GPL(vb2_reqbufs); @@ -755,6 +765,7 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) unsigned i; fill_buf_caps(q, &create->capabilities); + clear_consistency_attr(q, create->memory, &create->flags); create->index = q->num_buffers; if (create->count == 0) return ret != -EBUSY ? ret : 0; @@ -797,8 +808,11 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) for (i = 0; i < requested_planes; i++) if (requested_sizes[i] == 0) return -EINVAL; - return ret ? ret : vb2_core_create_bufs(q, create->memory, 0, - &create->count, requested_planes, requested_sizes); + return ret ? ret : vb2_core_create_bufs(q, create->memory, + create->flags, + &create->count, + requested_planes, + requested_sizes); } EXPORT_SYMBOL_GPL(vb2_create_bufs); @@ -969,11 +983,12 @@ int vb2_ioctl_reqbufs(struct file *file, void *priv, int res = vb2_verify_memory_type(vdev->queue, p->memory, p->type); fill_buf_caps(vdev->queue, &p->capabilities); + clear_consistency_attr(vdev->queue, p->memory, &p->flags); if (res) return res; if (vb2_queue_is_busy(vdev, file)) return -EBUSY; - res = vb2_core_reqbufs(vdev->queue, p->memory, 0, &p->count); + res = vb2_core_reqbufs(vdev->queue, p->memory, p->flags, &p->count); /* If count == 0, then the owner has released all buffers and he is no longer owner of the queue. Otherwise we have a new owner. */ if (res == 0) @@ -991,6 +1006,7 @@ int vb2_ioctl_create_bufs(struct file *file, void *priv, p->index = vdev->queue->num_buffers; fill_buf_caps(vdev->queue, &p->capabilities); + clear_consistency_attr(vdev->queue, p->memory, &p->flags); /* * If count == 0, then just check if memory and type are valid. * Any -EBUSY result from vb2_verify_memory_type can be mapped to 0. diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index a99e82ec9ab6..593bcf6c3735 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -246,6 +246,9 @@ struct v4l2_format32 { * @memory: buffer memory type * @format: frame format, for which buffers are requested * @capabilities: capabilities of this buffer type. + * @flags: additional buffer management attributes (ignored unless the + * queue has V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS capability and + * configured for MMAP streaming I/O). * @reserved: future extensions */ struct v4l2_create_buffers32 { @@ -254,7 +257,8 @@ struct v4l2_create_buffers32 { __u32 memory; /* enum v4l2_memory */ struct v4l2_format32 format; __u32 capabilities; - __u32 reserved[7]; + __u32 flags; + __u32 reserved[6]; }; static int __bufsize_v4l2_format(struct v4l2_format32 __user *p32, u32 *size) @@ -355,7 +359,8 @@ static int get_v4l2_create32(struct v4l2_create_buffers __user *p64, { if (!access_ok(p32, sizeof(*p32)) || copy_in_user(p64, p32, - offsetof(struct v4l2_create_buffers32, format))) + offsetof(struct v4l2_create_buffers32, format)) || + assign_in_user(&p64->flags, &p32->flags)) return -EFAULT; return __get_v4l2_format32(&p64->format, &p32->format, aux_buf, aux_space); @@ -417,6 +422,7 @@ static int put_v4l2_create32(struct v4l2_create_buffers __user *p64, copy_in_user(p32, p64, offsetof(struct v4l2_create_buffers32, format)) || assign_in_user(&p32->capabilities, &p64->capabilities) || + assign_in_user(&p32->flags, &p64->flags) || copy_in_user(p32->reserved, p64->reserved, sizeof(p64->reserved))) return -EFAULT; return __put_v4l2_format32(&p64->format, &p32->format); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 2322f08a98be..02bfef0da76d 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -2038,9 +2038,6 @@ static int v4l_reqbufs(const struct v4l2_ioctl_ops *ops, if (ret) return ret; - - CLEAR_AFTER_FIELD(p, capabilities); - return ops->vidioc_reqbufs(file, fh, p); } @@ -2080,7 +2077,7 @@ static int v4l_create_bufs(const struct v4l2_ioctl_ops *ops, if (ret) return ret; - CLEAR_AFTER_FIELD(create, capabilities); + CLEAR_AFTER_FIELD(create, flags); v4l_sanitize_format(&create->format); diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 34ba1017b89b..fec2607a07e3 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -946,7 +946,10 @@ struct v4l2_requestbuffers { __u32 type; /* enum v4l2_buf_type */ __u32 memory; /* enum v4l2_memory */ __u32 capabilities; - __u32 reserved[1]; + union { + __u32 flags; + __u32 reserved[1]; + }; }; /* capabilities for struct v4l2_requestbuffers and v4l2_create_buffers */ @@ -2450,6 +2453,9 @@ struct v4l2_dbg_chip_info { * @memory: enum v4l2_memory; buffer memory type * @format: frame format, for which buffers are requested * @capabilities: capabilities of this buffer type. + * @flags: additional buffer management attributes (ignored unless the + * queue has V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS capability + * and configured for MMAP streaming I/O). * @reserved: future extensions */ struct v4l2_create_buffers { @@ -2458,7 +2464,8 @@ struct v4l2_create_buffers { __u32 memory; struct v4l2_format format; __u32 capabilities; - __u32 reserved[7]; + __u32 flags; + __u32 reserved[6]; }; /* -- cgit v1.2.3 From 286cf7d3a99e1ca8c1d8e674b9a98f2dbe8520dc Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 26 May 2020 10:59:53 +0200 Subject: media: videodev2.h: add V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL flag Add the V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL flag to signal that the coded frame interval can be set separately from the raw frame interval for stateful encoders. Signed-off-by: Hans Verkuil Reviewed-by: Michael Tretter Acked-by: Tomasz Figa Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/vidioc-enum-fmt.rst | 30 ++++++++++++++++++---- .../userspace-api/media/videodev2.h.rst.exceptions | 1 + include/uapi/linux/videodev2.h | 1 + 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst index a53dd3d7f7e2..05835e04c20b 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst @@ -167,17 +167,37 @@ the ``mbus_code`` field is handled differently: - The hardware decoder for this compressed bytestream format (aka coded format) is capable of parsing a continuous bytestream. Applications do not need to parse the bytestream themselves to find the boundaries - between frames/fields. This flag can only be used in combination with - the ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to compressed + between frames/fields. + + This flag can only be used in combination with the + ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to compressed formats only. This flag is valid for stateful decoders only. * - ``V4L2_FMT_FLAG_DYN_RESOLUTION`` - 0x0008 - Dynamic resolution switching is supported by the device for this compressed bytestream format (aka coded format). It will notify the user via the event ``V4L2_EVENT_SOURCE_CHANGE`` when changes in the video - parameters are detected. This flag can only be used in combination - with the ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to - compressed formats only. It is also only applies to stateful codecs. + parameters are detected. + + This flag can only be used in combination with the + ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to + compressed formats only. This flag is valid for stateful codecs only. + * - ``V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL`` + - 0x0010 + - The hardware encoder supports setting the ``CAPTURE`` coded frame + interval separately from the ``OUTPUT`` raw frame interval. + Setting the ``OUTPUT`` raw frame interval with :ref:`VIDIOC_S_PARM ` + also sets the ``CAPTURE`` coded frame interval to the same value. + If this flag is set, then the ``CAPTURE`` coded frame interval can be + set to a different value afterwards. This is typically used for + offline encoding where the ``OUTPUT`` raw frame interval is used as + a hint for reserving hardware encoder resources and the ``CAPTURE`` coded + frame interval is the actual frame rate embedded in the encoded video + stream. + + This flag can only be used in combination with the + ``V4L2_FMT_FLAG_COMPRESSED`` flag, since this applies to + compressed formats only. This flag is valid for stateful encoders only. Return Value diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions index a625fb90e3a9..ca05e4e126b2 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions @@ -187,6 +187,7 @@ replace define V4L2_FMT_FLAG_COMPRESSED fmtdesc-flags replace define V4L2_FMT_FLAG_EMULATED fmtdesc-flags replace define V4L2_FMT_FLAG_CONTINUOUS_BYTESTREAM fmtdesc-flags replace define V4L2_FMT_FLAG_DYN_RESOLUTION fmtdesc-flags +replace define V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL fmtdesc-flags # V4L2 timecode types replace define V4L2_TC_TYPE_24FPS timecode-type diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index fec2607a07e3..303805438814 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -794,6 +794,7 @@ struct v4l2_fmtdesc { #define V4L2_FMT_FLAG_EMULATED 0x0002 #define V4L2_FMT_FLAG_CONTINUOUS_BYTESTREAM 0x0004 #define V4L2_FMT_FLAG_DYN_RESOLUTION 0x0008 +#define V4L2_FMT_FLAG_ENC_CAP_FRAME_INTERVAL 0x0010 /* Frame Size and frame rate enumeration */ /* -- cgit v1.2.3 From 79a28ddd18e9c653f13f60dfabee15c024e64b9b Mon Sep 17 00:00:00 2001 From: Alexandre Cassen Date: Tue, 23 Jun 2020 10:33:45 +0200 Subject: rtnetlink: add keepalived rtm_protocol Keepalived can set global static ip routes or virtual ip routes dynamically following VRRP protocol states. Using a dedicated rtm_protocol will help keeping track of it. Changes in v2: - fix tab/space indenting Signed-off-by: Alexandre Cassen Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 45 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 073e71ef6bdd..879e64950a0a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -257,12 +257,12 @@ enum { /* rtm_protocol */ -#define RTPROT_UNSPEC 0 -#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; - not used by current IPv4 */ -#define RTPROT_KERNEL 2 /* Route installed by kernel */ -#define RTPROT_BOOT 3 /* Route installed during boot */ -#define RTPROT_STATIC 4 /* Route installed by administrator */ +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; + not used by current IPv4 */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; they are just passed from user and back as is. @@ -271,22 +271,23 @@ enum { avoid conflicts. */ -#define RTPROT_GATED 8 /* Apparently, GateD */ -#define RTPROT_RA 9 /* RDISC/ND router advertisements */ -#define RTPROT_MRT 10 /* Merit MRT */ -#define RTPROT_ZEBRA 11 /* Zebra */ -#define RTPROT_BIRD 12 /* BIRD */ -#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ -#define RTPROT_XORP 14 /* XORP */ -#define RTPROT_NTK 15 /* Netsukuku */ -#define RTPROT_DHCP 16 /* DHCP client */ -#define RTPROT_MROUTED 17 /* Multicast daemon */ -#define RTPROT_BABEL 42 /* Babel daemon */ -#define RTPROT_BGP 186 /* BGP Routes */ -#define RTPROT_ISIS 187 /* ISIS Routes */ -#define RTPROT_OSPF 188 /* OSPF Routes */ -#define RTPROT_RIP 189 /* RIP Routes */ -#define RTPROT_EIGRP 192 /* EIGRP Routes */ +#define RTPROT_GATED 8 /* Apparently, GateD */ +#define RTPROT_RA 9 /* RDISC/ND router advertisements */ +#define RTPROT_MRT 10 /* Merit MRT */ +#define RTPROT_ZEBRA 11 /* Zebra */ +#define RTPROT_BIRD 12 /* BIRD */ +#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ +#define RTPROT_XORP 14 /* XORP */ +#define RTPROT_NTK 15 /* Netsukuku */ +#define RTPROT_DHCP 16 /* DHCP client */ +#define RTPROT_MROUTED 17 /* Multicast daemon */ +#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */ +#define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_BGP 186 /* BGP Routes */ +#define RTPROT_ISIS 187 /* ISIS Routes */ +#define RTPROT_OSPF 188 /* OSPF Routes */ +#define RTPROT_RIP 189 /* RIP Routes */ +#define RTPROT_EIGRP 192 /* EIGRP Routes */ /* rtm_scope -- cgit v1.2.3 From 2464bc7c2897b7549146a743045089d3aea7b85b Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 23 Jun 2020 11:05:40 +0200 Subject: bridge: uapi: mrp: Fix MRP_PORT_ROLE Currently the MRP_PORT_ROLE_NONE has the value 0x2 but this is in conflict with the IEC 62439-2 standard. The standard defines the following port roles: primary (0x0), secondary(0x1), interconnect(0x2). Therefore remove the port role none. Fixes: 4714d13791f831 ("bridge: uapi: mrp: Add mrp attributes.") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/uapi/linux/mrp_bridge.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index 84f15f48a7cb..bee366540212 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -36,7 +36,6 @@ enum br_mrp_port_state_type { enum br_mrp_port_role_type { BR_MRP_PORT_ROLE_PRIMARY, BR_MRP_PORT_ROLE_SECONDARY, - BR_MRP_PORT_ROLE_NONE, }; enum br_mrp_tlv_header_type { -- cgit v1.2.3 From bdb7b79b4ce864a724250e1d35948c46f135de36 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 22 Jun 2020 20:22:21 -0700 Subject: bpf: Switch most helper return values from 32-bit int to 64-bit long Switch most of BPF helper definitions from returning int to long. These definitions are coming from comments in BPF UAPI header and are used to generate bpf_helper_defs.h (under libbpf) to be later included and used from BPF programs. In actual in-kernel implementation, all the helpers are defined as returning u64, but due to some historical reasons, most of them are actually defined as returning int in UAPI (usually, to return 0 on success, and negative value on error). This actually causes Clang to quite often generate sub-optimal code, because compiler believes that return value is 32-bit, and in a lot of cases has to be up-converted (usually with a pair of 32-bit bit shifts) to 64-bit values, before they can be used further in BPF code. Besides just "polluting" the code, these 32-bit shifts quite often cause problems for cases in which return value matters. This is especially the case for the family of bpf_probe_read_str() functions. There are few other similar helpers (e.g., bpf_read_branch_records()), in which return value is used by BPF program logic to record variable-length data and process it. For such cases, BPF program logic carefully manages offsets within some array or map to read variable-length data. For such uses, it's crucial for BPF verifier to track possible range of register values to prove that all the accesses happen within given memory bounds. Those extraneous zero-extending bit shifts, inserted by Clang (and quite often interleaved with other code, which makes the issues even more challenging and sometimes requires employing extra per-variable compiler barriers), throws off verifier logic and makes it mark registers as having unknown variable offset. We'll study this pattern a bit later below. Another common pattern is to check return of BPF helper for non-zero state to detect error conditions and attempt alternative actions in such case. Even in this simple and straightforward case, this 32-bit vs BPF's native 64-bit mode quite often leads to sub-optimal and unnecessary extra code. We'll look at this pattern as well. Clang's BPF target supports two modes of code generation: ALU32, in which it is capable of using lower 32-bit parts of registers, and no-ALU32, in which only full 64-bit registers are being used. ALU32 mode somewhat mitigates the above described problems, but not in all cases. This patch switches all the cases in which BPF helpers return 0 or negative error from returning int to returning long. It is shown below that such change in definition leads to equivalent or better code. No-ALU32 mode benefits more, but ALU32 mode doesn't degrade or still gets improved code generation. Another class of cases switched from int to long are bpf_probe_read_str()-like helpers, which encode successful case as non-negative values, while still returning negative value for errors. In all of such cases, correctness is preserved due to two's complement encoding of negative values and the fact that all helpers return values with 32-bit absolute value. Two's complement ensures that for negative values higher 32 bits are all ones and when truncated, leave valid negative 32-bit value with the same value. Non-negative values have upper 32 bits set to zero and similarly preserve value when high 32 bits are truncated. This means that just casting to int/u32 is correct and efficient (and in ALU32 mode doesn't require any extra shifts). To minimize the chances of regressions, two code patterns were investigated, as mentioned above. For both patterns, BPF assembly was analyzed in ALU32/NO-ALU32 compiler modes, both with current 32-bit int return type and new 64-bit long return type. Case 1. Variable-length data reading and concatenation. This is quite ubiquitous pattern in tracing/monitoring applications, reading data like process's environment variables, file path, etc. In such case, many pieces of string-like variable-length data are read into a single big buffer, and at the end of the process, only a part of array containing actual data is sent to user-space for further processing. This case is tested in test_varlen.c selftest (in the next patch). Code flow is roughly as follows: void *payload = &sample->payload; u64 len; len = bpf_probe_read_kernel_str(payload, MAX_SZ1, &source_data1); if (len <= MAX_SZ1) { payload += len; sample->len1 = len; } len = bpf_probe_read_kernel_str(payload, MAX_SZ2, &source_data2); if (len <= MAX_SZ2) { payload += len; sample->len2 = len; } /* and so on */ sample->total_len = payload - &sample->payload; /* send over, e.g., perf buffer */ There could be two variations with slightly different code generated: when len is 64-bit integer and when it is 32-bit integer. Both variations were analysed. BPF assembly instructions between two successive invocations of bpf_probe_read_kernel_str() were used to check code regressions. Results are below, followed by short analysis. Left side is using helpers with int return type, the right one is after the switch to long. ALU32 + INT ALU32 + LONG =========== ============ 64-BIT (13 insns): 64-BIT (10 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: if w0 > 256 goto +9 18: if r0 > 256 goto +6 19: w1 = w0 19: r1 = 0 ll 20: r1 <<= 32 21: *(u64 *)(r1 + 0) = r0 21: r1 s>>= 32 22: r6 = 0 ll 22: r2 = 0 ll 24: r6 += r0 24: *(u64 *)(r2 + 0) = r1 00000000000000c8 : 25: r6 = 0 ll 25: r1 = r6 27: r6 += r1 26: w2 = 256 00000000000000e0 : 27: r3 = 0 ll 28: r1 = r6 29: call 115 29: w2 = 256 30: r3 = 0 ll 32: call 115 32-BIT (11 insns): 32-BIT (12 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: if w0 > 256 goto +7 18: if w0 > 256 goto +8 19: r1 = 0 ll 19: r1 = 0 ll 21: *(u32 *)(r1 + 0) = r0 21: *(u32 *)(r1 + 0) = r0 22: w1 = w0 22: r0 <<= 32 23: r6 = 0 ll 23: r0 >>= 32 25: r6 += r1 24: r6 = 0 ll 00000000000000d0 : 26: r6 += r0 26: r1 = r6 00000000000000d8 : 27: w2 = 256 27: r1 = r6 28: r3 = 0 ll 28: w2 = 256 30: call 115 29: r3 = 0 ll 31: call 115 In ALU32 mode, the variant using 64-bit length variable clearly wins and avoids unnecessary zero-extension bit shifts. In practice, this is even more important and good, because BPF code won't need to do extra checks to "prove" that payload/len are within good bounds. 32-bit len is one instruction longer. Clang decided to do 64-to-32 casting with two bit shifts, instead of equivalent `w1 = w0` assignment. The former uses extra register. The latter might potentially lose some range information, but not for 32-bit value. So in this case, verifier infers that r0 is [0, 256] after check at 18:, and shifting 32 bits left/right keeps that range intact. We should probably look into Clang's logic and see why it chooses bitshifts over sub-register assignments for this. NO-ALU32 + INT NO-ALU32 + LONG ============== =============== 64-BIT (14 insns): 64-BIT (10 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: r0 <<= 32 18: if r0 > 256 goto +6 19: r1 = r0 19: r1 = 0 ll 20: r1 >>= 32 21: *(u64 *)(r1 + 0) = r0 21: if r1 > 256 goto +7 22: r6 = 0 ll 22: r0 s>>= 32 24: r6 += r0 23: r1 = 0 ll 00000000000000c8 : 25: *(u64 *)(r1 + 0) = r0 25: r1 = r6 26: r6 = 0 ll 26: r2 = 256 28: r6 += r0 27: r3 = 0 ll 00000000000000e8 : 29: call 115 29: r1 = r6 30: r2 = 256 31: r3 = 0 ll 33: call 115 32-BIT (13 insns): 32-BIT (13 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: r1 = r0 18: r1 = r0 19: r1 <<= 32 19: r1 <<= 32 20: r1 >>= 32 20: r1 >>= 32 21: if r1 > 256 goto +6 21: if r1 > 256 goto +6 22: r2 = 0 ll 22: r2 = 0 ll 24: *(u32 *)(r2 + 0) = r0 24: *(u32 *)(r2 + 0) = r0 25: r6 = 0 ll 25: r6 = 0 ll 27: r6 += r1 27: r6 += r1 00000000000000e0 : 00000000000000e0 : 28: r1 = r6 28: r1 = r6 29: r2 = 256 29: r2 = 256 30: r3 = 0 ll 30: r3 = 0 ll 32: call 115 32: call 115 In NO-ALU32 mode, for the case of 64-bit len variable, Clang generates much superior code, as expected, eliminating unnecessary bit shifts. For 32-bit len, code is identical. So overall, only ALU-32 32-bit len case is more-or-less equivalent and the difference stems from internal Clang decision, rather than compiler lacking enough information about types. Case 2. Let's look at the simpler case of checking return result of BPF helper for errors. The code is very simple: long bla; if (bpf_probe_read_kenerl(&bla, sizeof(bla), 0)) return 1; else return 0; ALU32 + CHECK (9 insns) ALU32 + CHECK (9 insns) ==================================== ==================================== 0: r1 = r10 0: r1 = r10 1: r1 += -8 1: r1 += -8 2: w2 = 8 2: w2 = 8 3: r3 = 0 3: r3 = 0 4: call 113 4: call 113 5: w1 = w0 5: r1 = r0 6: w0 = 1 6: w0 = 1 7: if w1 != 0 goto +1 7: if r1 != 0 goto +1 8: w0 = 0 8: w0 = 0 0000000000000048 : 0000000000000048 : 9: exit 9: exit Almost identical code, the only difference is the use of full register assignment (r1 = r0) vs half-registers (w1 = w0) in instruction #5. On 32-bit architectures, new BPF assembly might be slightly less optimal, in theory. But one can argue that's not a big issue, given that use of full registers is still prevalent (e.g., for parameter passing). NO-ALU32 + CHECK (11 insns) NO-ALU32 + CHECK (9 insns) ==================================== ==================================== 0: r1 = r10 0: r1 = r10 1: r1 += -8 1: r1 += -8 2: r2 = 8 2: r2 = 8 3: r3 = 0 3: r3 = 0 4: call 113 4: call 113 5: r1 = r0 5: r1 = r0 6: r1 <<= 32 6: r0 = 1 7: r1 >>= 32 7: if r1 != 0 goto +1 8: r0 = 1 8: r0 = 0 9: if r1 != 0 goto +1 0000000000000048 : 10: r0 = 0 9: exit 0000000000000058 : 11: exit NO-ALU32 is a clear improvement, getting rid of unnecessary zero-extension bit shifts. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200623032224.4020118-1-andriin@fb.com --- include/uapi/linux/bpf.h | 192 ++++++++++++++++++++--------------------- tools/include/uapi/linux/bpf.h | 192 ++++++++++++++++++++--------------------- 2 files changed, 192 insertions(+), 192 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 19684813faae..be0efee49093 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -653,7 +653,7 @@ union bpf_attr { * Map value associated to *key*, or **NULL** if no entry was * found. * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: @@ -671,13 +671,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. @@ -695,7 +695,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) @@ -775,7 +775,7 @@ union bpf_attr { * Return * The SMP id of the processor running the program. * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of @@ -792,7 +792,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper @@ -817,7 +817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. Computation is incremental, so the @@ -849,7 +849,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack @@ -880,7 +880,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress @@ -916,7 +916,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(void *buf, u32 size_of_buf) + * long bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -953,7 +953,7 @@ union bpf_attr { * Return * The classid, or 0 for the default unconfigured classid. * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update @@ -969,7 +969,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_vlan_pop(struct sk_buff *skb) + * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. * @@ -981,7 +981,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be @@ -1032,7 +1032,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The @@ -1098,7 +1098,7 @@ union bpf_attr { * The value of the perf event counter read from the map, or a * negative error code in case of failure. * - * int bpf_redirect(u32 ifindex, u64 flags) + * long bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_clone_redirect**\ @@ -1145,7 +1145,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1190,7 +1190,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1207,7 +1207,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1276,7 +1276,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1294,7 +1294,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1304,7 +1304,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to @@ -1331,7 +1331,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except @@ -1358,7 +1358,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. @@ -1389,7 +1389,7 @@ union bpf_attr { * Return * A pointer to the current task struct. * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in @@ -1408,7 +1408,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run is the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by @@ -1420,7 +1420,7 @@ union bpf_attr { * * 1, if the *skb* task does not belong to the cgroup2. * * A negative error code, if an error occurred. * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must @@ -1444,7 +1444,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes @@ -1500,7 +1500,7 @@ union bpf_attr { * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * - * int bpf_get_numa_node_id(void) + * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. The primary use case * for this helper is the selection of sockets for the local NUMA @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * The id of current NUMA node. * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of @@ -1532,7 +1532,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper @@ -1547,7 +1547,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for @@ -1595,14 +1595,14 @@ union bpf_attr { * is returned (note that **overflowuid** might also be the actual * UID value for the socket). * - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. * Return * 0 * - * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1630,7 +1630,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. @@ -1676,7 +1676,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -1697,7 +1697,7 @@ union bpf_attr { * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * - * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1708,7 +1708,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1727,7 +1727,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this @@ -1756,7 +1756,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type @@ -1806,7 +1806,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1817,7 +1817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1842,7 +1842,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_override_return(struct pt_regs *regs, u64 rc) + * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. @@ -1867,7 +1867,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1911,7 +1911,7 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -1925,7 +1925,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. @@ -1959,7 +1959,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been @@ -1977,7 +1977,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ @@ -2008,7 +2008,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing @@ -2026,7 +2026,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. @@ -2040,7 +2040,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. @@ -2056,7 +2056,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -2089,7 +2089,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2111,7 +2111,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be @@ -2142,7 +2142,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2161,7 +2161,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -2175,7 +2175,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. @@ -2189,7 +2189,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at @@ -2226,7 +2226,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs @@ -2241,7 +2241,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to @@ -2257,7 +2257,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter @@ -2286,7 +2286,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_repeat(void *ctx) + * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays @@ -2305,7 +2305,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, @@ -2370,7 +2370,7 @@ union bpf_attr { * Return * A pointer to the local storage area. * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. @@ -2471,7 +2471,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(struct bpf_sock *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -2479,7 +2479,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * @@ -2489,19 +2489,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2517,7 +2517,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if @@ -2529,7 +2529,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. @@ -2543,7 +2543,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_lock(struct bpf_spin_lock *lock) + * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. Taking the lock allows to @@ -2591,7 +2591,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). @@ -2614,7 +2614,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** @@ -2651,7 +2651,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2666,7 +2666,7 @@ union bpf_attr { * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. * - * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. @@ -2682,7 +2682,7 @@ union bpf_attr { * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * - * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided @@ -2701,7 +2701,7 @@ union bpf_attr { * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * - * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into @@ -2718,7 +2718,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. @@ -2735,7 +2735,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base @@ -2759,7 +2759,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the @@ -2810,7 +2810,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -2818,7 +2818,7 @@ union bpf_attr { * * **-ENOENT** if the bpf-local-storage cannot be found. * - * int bpf_send_signal(u32 sig) + * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. @@ -2859,7 +2859,7 @@ union bpf_attr { * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * - * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -2883,21 +2883,21 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the @@ -2941,7 +2941,7 @@ union bpf_attr { * including the trailing NUL character. On error, a negative * value. * - * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. @@ -2949,14 +2949,14 @@ union bpf_attr { * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * - * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_send_signal_thread(u32 sig) + * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return @@ -2976,7 +2976,7 @@ union bpf_attr { * Return * The 64 bit jiffies * - * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* @@ -2995,7 +2995,7 @@ union bpf_attr { * * **-ENOENT** if architecture does not support branch records. * - * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. @@ -3007,7 +3007,7 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * - * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -3062,7 +3062,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, @@ -3097,7 +3097,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. @@ -3126,7 +3126,7 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the @@ -3221,7 +3221,7 @@ union bpf_attr { * Return * Requested value, or 0, if flags are not recognized. * - * int bpf_csum_level(struct sk_buff *skb, u64 level) + * long bpf_csum_level(struct sk_buff *skb, u64 level) * Description * Change the skbs checksum level by one layer up or down, or * reset it entirely to none in order to have the stack perform diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 19684813faae..be0efee49093 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -653,7 +653,7 @@ union bpf_attr { * Map value associated to *key*, or **NULL** if no entry was * found. * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: @@ -671,13 +671,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. @@ -695,7 +695,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) @@ -775,7 +775,7 @@ union bpf_attr { * Return * The SMP id of the processor running the program. * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of @@ -792,7 +792,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper @@ -817,7 +817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. Computation is incremental, so the @@ -849,7 +849,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack @@ -880,7 +880,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress @@ -916,7 +916,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(void *buf, u32 size_of_buf) + * long bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -953,7 +953,7 @@ union bpf_attr { * Return * The classid, or 0 for the default unconfigured classid. * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update @@ -969,7 +969,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_vlan_pop(struct sk_buff *skb) + * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. * @@ -981,7 +981,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be @@ -1032,7 +1032,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The @@ -1098,7 +1098,7 @@ union bpf_attr { * The value of the perf event counter read from the map, or a * negative error code in case of failure. * - * int bpf_redirect(u32 ifindex, u64 flags) + * long bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_clone_redirect**\ @@ -1145,7 +1145,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1190,7 +1190,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1207,7 +1207,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1276,7 +1276,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1294,7 +1294,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1304,7 +1304,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to @@ -1331,7 +1331,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except @@ -1358,7 +1358,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. @@ -1389,7 +1389,7 @@ union bpf_attr { * Return * A pointer to the current task struct. * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in @@ -1408,7 +1408,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run is the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by @@ -1420,7 +1420,7 @@ union bpf_attr { * * 1, if the *skb* task does not belong to the cgroup2. * * A negative error code, if an error occurred. * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must @@ -1444,7 +1444,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes @@ -1500,7 +1500,7 @@ union bpf_attr { * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * - * int bpf_get_numa_node_id(void) + * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. The primary use case * for this helper is the selection of sockets for the local NUMA @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * The id of current NUMA node. * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of @@ -1532,7 +1532,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper @@ -1547,7 +1547,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for @@ -1595,14 +1595,14 @@ union bpf_attr { * is returned (note that **overflowuid** might also be the actual * UID value for the socket). * - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. * Return * 0 * - * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1630,7 +1630,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. @@ -1676,7 +1676,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -1697,7 +1697,7 @@ union bpf_attr { * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * - * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1708,7 +1708,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1727,7 +1727,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this @@ -1756,7 +1756,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type @@ -1806,7 +1806,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1817,7 +1817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1842,7 +1842,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_override_return(struct pt_regs *regs, u64 rc) + * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. @@ -1867,7 +1867,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1911,7 +1911,7 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -1925,7 +1925,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. @@ -1959,7 +1959,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been @@ -1977,7 +1977,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ @@ -2008,7 +2008,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing @@ -2026,7 +2026,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. @@ -2040,7 +2040,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. @@ -2056,7 +2056,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -2089,7 +2089,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2111,7 +2111,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be @@ -2142,7 +2142,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2161,7 +2161,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -2175,7 +2175,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. @@ -2189,7 +2189,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at @@ -2226,7 +2226,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs @@ -2241,7 +2241,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to @@ -2257,7 +2257,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter @@ -2286,7 +2286,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_repeat(void *ctx) + * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays @@ -2305,7 +2305,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, @@ -2370,7 +2370,7 @@ union bpf_attr { * Return * A pointer to the local storage area. * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. @@ -2471,7 +2471,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(struct bpf_sock *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -2479,7 +2479,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * @@ -2489,19 +2489,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2517,7 +2517,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if @@ -2529,7 +2529,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. @@ -2543,7 +2543,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_lock(struct bpf_spin_lock *lock) + * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. Taking the lock allows to @@ -2591,7 +2591,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). @@ -2614,7 +2614,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** @@ -2651,7 +2651,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2666,7 +2666,7 @@ union bpf_attr { * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. * - * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. @@ -2682,7 +2682,7 @@ union bpf_attr { * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * - * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided @@ -2701,7 +2701,7 @@ union bpf_attr { * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * - * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into @@ -2718,7 +2718,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. @@ -2735,7 +2735,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base @@ -2759,7 +2759,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the @@ -2810,7 +2810,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -2818,7 +2818,7 @@ union bpf_attr { * * **-ENOENT** if the bpf-local-storage cannot be found. * - * int bpf_send_signal(u32 sig) + * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. @@ -2859,7 +2859,7 @@ union bpf_attr { * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * - * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -2883,21 +2883,21 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the @@ -2941,7 +2941,7 @@ union bpf_attr { * including the trailing NUL character. On error, a negative * value. * - * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. @@ -2949,14 +2949,14 @@ union bpf_attr { * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * - * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_send_signal_thread(u32 sig) + * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return @@ -2976,7 +2976,7 @@ union bpf_attr { * Return * The 64 bit jiffies * - * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* @@ -2995,7 +2995,7 @@ union bpf_attr { * * **-ENOENT** if architecture does not support branch records. * - * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. @@ -3007,7 +3007,7 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * - * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -3062,7 +3062,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, @@ -3097,7 +3097,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. @@ -3126,7 +3126,7 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the @@ -3221,7 +3221,7 @@ union bpf_attr { * Return * Requested value, or 0, if flags are not recognized. * - * int bpf_csum_level(struct sk_buff *skb, u64 level) + * long bpf_csum_level(struct sk_buff *skb, u64 level) * Description * Change the skbs checksum level by one layer up or down, or * reset it entirely to none in order to have the stack perform -- cgit v1.2.3 From bcc7f554cfa7e0ac77c7adc4027c16f4a2f99c6f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 23 Jun 2020 16:39:35 +0100 Subject: bpf: Fix formatting in documentation for BPF helpers When producing the bpf-helpers.7 man page from the documentation from the BPF user space header file, rst2man complains: :2636: (ERROR/3) Unexpected indentation. :2640: (WARNING/2) Block quote ends without a blank line; unexpected unindent. Let's fix formatting for the relevant chunk (item list in bpf_ringbuf_query()'s description), and for a couple other functions. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200623153935.6215-1-quentin@isovalent.com --- include/uapi/linux/bpf.h | 41 +++++++++++++++++++++-------------------- tools/include/uapi/linux/bpf.h | 41 +++++++++++++++++++++-------------------- 2 files changed, 42 insertions(+), 40 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974a71342aea..8bd33050b7bb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3171,13 +3171,12 @@ union bpf_attr { * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3189,20 +3188,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3210,16 +3209,18 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. * * int bpf_csum_level(struct sk_buff *skb, u64 level) * Description diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974a71342aea..8bd33050b7bb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3171,13 +3171,12 @@ union bpf_attr { * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3189,20 +3188,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3210,16 +3209,18 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. * * int bpf_csum_level(struct sk_buff *skb, u64 level) * Description -- cgit v1.2.3 From 428d2459cceb77357b81c242ca22462a6a904817 Mon Sep 17 00:00:00 2001 From: Petr Vaněk Date: Sat, 30 May 2020 14:39:12 +0200 Subject: xfrm: introduce oseq-may-wrap flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC 4303 in section 3.3.3 suggests to disable anti-replay for manually distributed ICVs in which case the sender does not need to monitor or reset the counter. However, the sender still increments the counter and when it reaches the maximum value, the counter rolls over back to zero. This patch introduces new extra_flag XFRM_SA_XFLAG_OSEQ_MAY_WRAP which allows sequence number to cycle in outbound packets if set. This flag is used only in legacy and bmp code, because esn should not be negotiated if anti-replay is disabled (see note in 3.3.3 section). Signed-off-by: Petr Vaněk Acked-by: Christophe Gouault Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 1 + net/xfrm/xfrm_replay.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index ff7cfdc6cb44..ffc6a5391bb7 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -387,6 +387,7 @@ struct xfrm_usersa_info { }; #define XFRM_SA_XFLAG_DONT_ENCAP_DSCP 1 +#define XFRM_SA_XFLAG_OSEQ_MAY_WRAP 2 struct xfrm_usersa_id { xfrm_address_t daddr; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 98943f8d01aa..c6a4338a0d08 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -89,7 +89,8 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(x->replay.oseq == 0)) { + if (unlikely(x->replay.oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { x->replay.oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -168,7 +169,8 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(replay_esn->oseq == 0)) { + if (unlikely(replay_esn->oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { replay_esn->oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -572,7 +574,8 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < x->replay.oseq)) { + if (unlikely(oseq < x->replay.oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -611,7 +614,8 @@ static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < replay_esn->oseq)) { + if (unlikely(oseq < replay_esn->oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; -- cgit v1.2.3 From 0b8975bdc0cc5310d48d9bdd871cefebe1f94c99 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Jun 2020 10:27:48 -0700 Subject: dmaengine: idxd: fix hw descriptor fields for delta record Fix the hw descriptor fields for delta record in user exported idxd.h header. Missing the "expected result mask" field. Reported-by: Mona Hossain Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159120526866.65385.536565786678052944.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 1f412fbf561b..e103c1434e4b 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -110,9 +110,12 @@ struct dsa_hw_desc { uint16_t rsvd1; union { uint8_t expected_res; + /* create delta record */ struct { uint64_t delta_addr; uint32_t max_delta_size; + uint32_t delt_rsvd; + uint8_t expected_res_mask; }; uint32_t delta_rec_size; uint64_t dest2; -- cgit v1.2.3 From 65959522f8060659e308977f09f3eb7b7af5e43f Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 23 Jun 2020 14:30:40 +0300 Subject: RDMA: Add support to dump resource tracker in RAW format Add support to get resource dump in raw format. It enable drivers to return the entire device specific QP/CQ/MR context without a need from the driver to set each field separately. The raw query returns only the device specific data, general data is still returned by using the existing queries. Example: $ rdma res show mr dev mlx5_1 mrn 2 -r -j [{"ifindex":7,"ifname":"mlx5_1", "data":[0,4,255,254,0,0,0,0,0,0,0,0,16,28,0,216,...]}] Link: https://lore.kernel.org/r/20200623113043.1228482-9-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 3 + drivers/infiniband/core/nldev.c | 180 +++++++++++++++++++++++++-------------- include/rdma/ib_verbs.h | 3 + include/uapi/rdma/rdma_netlink.h | 8 ++ 4 files changed, 132 insertions(+), 62 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index cbe95e729cf1..1335ed1f1e4a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2619,8 +2619,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); + SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); + SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); + SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 394e307c342c..1051b5622b62 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -114,6 +114,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY }, [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, @@ -446,11 +447,11 @@ static int fill_res_name_pid(struct sk_buff *msg, return err ? -EMSGSIZE : 0; } -static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, - struct rdma_restrack_entry *res, uint32_t port) +static int fill_res_qp_entry_query(struct sk_buff *msg, + struct rdma_restrack_entry *res, + struct ib_device *dev, + struct ib_qp *qp) { - struct ib_qp *qp = container_of(res, struct ib_qp, res); - struct ib_device *dev = qp->device; struct ib_qp_init_attr qp_init_attr; struct ib_qp_attr qp_attr; int ret; @@ -459,16 +460,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (ret) return ret; - if (port && port != qp_attr.port_num) - return -EAGAIN; - - /* In create_qp() port is not set yet */ - if (qp_attr.port_num && - nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num)) - goto err; - - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num)) - goto err; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, qp_attr.dest_qp_num)) @@ -492,13 +483,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; - if (!rdma_is_kernel_res(res) && - nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) - goto err; - - if (fill_res_name_pid(msg, res)) - goto err; - if (dev->ops.fill_res_qp_entry) return dev->ops.fill_res_qp_entry(msg, qp); return 0; @@ -506,6 +490,48 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, err: return -EMSGSIZE; } +static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + int ret; + + if (port && port != qp->port) + return -EAGAIN; + + /* In create_qp() port is not set yet */ + if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) + return -EINVAL; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); + if (ret) + return -EMSGSIZE; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) + return -EMSGSIZE; + + ret = fill_res_name_pid(msg, res); + if (ret) + return -EMSGSIZE; + + return fill_res_qp_entry_query(msg, res, dev, qp); +} + +static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + + if (port && port != qp->port) + return -EAGAIN; + if (!dev->ops.fill_res_qp_entry_raw) + return -EINVAL; + return dev->ops.fill_res_qp_entry_raw(msg, qp); +} + static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { @@ -565,34 +591,42 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) - goto err; + return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) - goto err; + return -EMSGSIZE; /* Poll context is only valid for kernel CQs */ if (rdma_is_kernel_res(res) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) - goto err; + return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) - goto err; + return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, cq->uobject->uevent.uobject.context->res.id)) - goto err; + return -EMSGSIZE; if (fill_res_name_pid(msg, res)) - goto err; + return -EMSGSIZE; - if (dev->ops.fill_res_cq_entry) - return dev->ops.fill_res_cq_entry(msg, cq); - return 0; + return (dev->ops.fill_res_cq_entry) ? + dev->ops.fill_res_cq_entry(msg, cq) : 0; +} -err: return -EMSGSIZE; +static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct ib_device *dev = cq->device; + + if (!dev->ops.fill_res_cq_entry_raw) + return -EINVAL; + return dev->ops.fill_res_cq_entry_raw(msg, cq); } static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, @@ -603,30 +637,39 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) - goto err; + return -EMSGSIZE; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, RDMA_NLDEV_ATTR_PAD)) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) - goto err; + return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) - goto err; + return -EMSGSIZE; if (fill_res_name_pid(msg, res)) - goto err; + return -EMSGSIZE; - if (dev->ops.fill_res_mr_entry) - return dev->ops.fill_res_mr_entry(msg, mr); - return 0; + return (dev->ops.fill_res_mr_entry) ? + dev->ops.fill_res_mr_entry(msg, mr) : + 0; +} -err: return -EMSGSIZE; +static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct ib_device *dev = mr->pd->device; + + if (!dev->ops.fill_res_mr_entry_raw) + return -EINVAL; + return dev->ops.fill_res_mr_entry_raw(msg, mr); } static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, @@ -1149,7 +1192,6 @@ static int nldev_res_get_dumpit(struct sk_buff *skb, struct nldev_fill_res_entry { enum rdma_nldev_attr nldev_attr; - enum rdma_nldev_command nldev_cmd; u8 flags; u32 entry; u32 id; @@ -1161,40 +1203,34 @@ enum nldev_res_flags { static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { - .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, [RDMA_RESTRACK_COUNTER] = { - .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET, .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, @@ -1253,7 +1289,8 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(nlh->nlmsg_type)), 0, 0); if (fill_nldev_handle(msg, device)) { @@ -1331,7 +1368,8 @@ static int res_get_common_dumpit(struct sk_buff *skb, } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); if (fill_nldev_handle(skb, device)) { @@ -1413,26 +1451,29 @@ err_index: return ret; } -#define RES_GET_FUNCS(name, type) \ - static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ +#define RES_GET_FUNCS(name, type) \ + static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ struct netlink_callback *cb) \ - { \ - return res_get_common_dumpit(skb, cb, type, \ - fill_res_##name##_entry); \ - } \ - static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ - struct nlmsghdr *nlh, \ + { \ + return res_get_common_dumpit(skb, cb, type, \ + fill_res_##name##_entry); \ + } \ + static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ + struct nlmsghdr *nlh, \ struct netlink_ext_ack *extack) \ - { \ - return res_get_common_doit(skb, nlh, extack, type, \ - fill_res_##name##_entry); \ + { \ + return res_get_common_doit(skb, nlh, extack, type, \ + fill_res_##name##_entry); \ } RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); +RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR); RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); static LIST_HEAD(link_ops); @@ -2117,6 +2158,21 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_stat_del_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = { + .doit = nldev_res_get_qp_raw_doit, + .dump = nldev_res_get_qp_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = { + .doit = nldev_res_get_cq_raw_doit, + .dump = nldev_res_get_cq_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = { + .doit = nldev_res_get_mr_raw_doit, + .dump = nldev_res_get_mr_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, }; void __init nldev_init(void) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9127cffafccd..77106ff3cd26 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2583,8 +2583,11 @@ struct ib_device_ops { * Allows rdma drivers to add their own restrack attributes. */ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); + int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq); + int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp); + int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id); /* Device lifecycle callbacks */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 8e277783fa96..3826143d420d 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -287,6 +287,12 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_STAT_DEL, + RDMA_NLDEV_CMD_RES_QP_GET_RAW, + + RDMA_NLDEV_CMD_RES_CQ_GET_RAW, + + RDMA_NLDEV_CMD_RES_MR_GET_RAW, + RDMA_NLDEV_NUM_OPS }; @@ -525,6 +531,8 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_DEV_DIM, /* u8 */ + RDMA_NLDEV_ATTR_RES_RAW, /* binary */ + /* * Always the end */ -- cgit v1.2.3 From 62fb45d317c5fa08e4db093441835bb6f33acbd7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 18 Jun 2020 16:42:06 +0200 Subject: USB: ch9: add "USB_" prefix in front of TEST defines For some reason, the TEST_ defines in the usb/ch9.h files did not have the USB_ prefix on it, making it a bit confusing when reading the file, as well as not the nicest thing to do in a uapi file. So fix that up and add the USB_ prefix on to them, and fix up all in-kernel usages. This included deleting the duplicate copy in the net2272.h file. Cc: Felipe Balbi Cc: Michal Simek Cc: Mathias Nyman Cc: Pawel Laszczak Cc: YueHaibing Cc: Nathan Chancellor Cc: Jason Yan Cc: Jia-Ju Bai Cc: Stephen Boyd Cc: Christophe JAILLET Cc: Arnd Bergmann Cc: Jules Irenge Cc: Alan Stern Cc: Thinh Nguyen Cc: Rob Gill Cc: Macpaul Lin Acked-by: Minas Harutyunyan Acked-by: Bin Liu Acked-by: Chunfeng Yun Acked-by: Peter Chen Link: https://lore.kernel.org/r/20200618144206.2655890-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/ep0.c | 8 ++++---- drivers/usb/chipidea/udc.c | 10 +++++----- drivers/usb/common/debug.c | 10 +++++----- drivers/usb/dwc2/debugfs.c | 20 ++++++++++---------- drivers/usb/dwc2/gadget.c | 10 +++++----- drivers/usb/dwc3/debugfs.c | 20 ++++++++++---------- drivers/usb/dwc3/ep0.c | 10 +++++----- drivers/usb/dwc3/gadget.c | 10 +++++----- drivers/usb/gadget/udc/bdc/bdc_ep.c | 10 +++++----- drivers/usb/gadget/udc/gr_udc.c | 4 ++-- drivers/usb/gadget/udc/mv_udc_core.c | 2 +- drivers/usb/gadget/udc/net2272.c | 2 +- drivers/usb/gadget/udc/net2272.h | 5 ----- drivers/usb/gadget/udc/udc-xilinx.c | 4 ++-- drivers/usb/host/xhci-hub.c | 7 ++++--- drivers/usb/misc/ehset.c | 8 ++++---- drivers/usb/mtu3/mtu3_gadget_ep0.c | 16 ++++++++-------- drivers/usb/musb/musb_gadget_ep0.c | 20 ++++++++------------ drivers/usb/musb/musb_virthub.c | 20 ++++++++++---------- include/uapi/linux/usb/ch9.h | 10 +++++----- 20 files changed, 99 insertions(+), 107 deletions(-) (limited to 'include/uapi') diff --git a/drivers/usb/cdns3/ep0.c b/drivers/usb/cdns3/ep0.c index 82645a2a0f52..04a522f5ae58 100644 --- a/drivers/usb/cdns3/ep0.c +++ b/drivers/usb/cdns3/ep0.c @@ -328,10 +328,10 @@ static int cdns3_ep0_feature_handle_device(struct cdns3_device *priv_dev, return -EINVAL; switch (tmode >> 8) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: cdns3_set_register_bit(&priv_dev->regs->usb_cmd, USB_CMD_STMODE | USB_STS_TMODE_SEL(tmode - 1)); diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index db0cfde0cc3c..4beb25888917 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -1215,11 +1215,11 @@ __acquires(ci->lock) case USB_DEVICE_TEST_MODE: tmode = le16_to_cpu(req.wIndex) >> 8; switch (tmode) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: ci->test_mode = tmode; err = isr_setup_status_phase( ci); diff --git a/drivers/usb/common/debug.c b/drivers/usb/common/debug.c index 92a986aeaa5d..410acd670ca7 100644 --- a/drivers/usb/common/debug.c +++ b/drivers/usb/common/debug.c @@ -53,15 +53,15 @@ static const char *usb_decode_device_feature(u16 wValue) static const char *usb_decode_test_mode(u16 wIndex) { switch (wIndex) { - case TEST_J: + case USB_TEST_J: return ": TEST_J"; - case TEST_K: + case USB_TEST_K: return ": TEST_K"; - case TEST_SE0_NAK: + case USB_TEST_SE0_NAK: return ": TEST_SE0_NAK"; - case TEST_PACKET: + case USB_TEST_PACKET: return ": TEST_PACKET"; - case TEST_FORCE_EN: + case USB_TEST_FORCE_ENABLE: return ": TEST_FORCE_EN"; default: return ": UNKNOWN"; diff --git a/drivers/usb/dwc2/debugfs.c b/drivers/usb/dwc2/debugfs.c index 3a0dcbfbc827..aaafd463d72a 100644 --- a/drivers/usb/dwc2/debugfs.c +++ b/drivers/usb/dwc2/debugfs.c @@ -37,15 +37,15 @@ static ssize_t testmode_write(struct file *file, const char __user *ubuf, size_t return -EFAULT; if (!strncmp(buf, "test_j", 6)) - testmode = TEST_J; + testmode = USB_TEST_J; else if (!strncmp(buf, "test_k", 6)) - testmode = TEST_K; + testmode = USB_TEST_K; else if (!strncmp(buf, "test_se0_nak", 12)) - testmode = TEST_SE0_NAK; + testmode = USB_TEST_SE0_NAK; else if (!strncmp(buf, "test_packet", 11)) - testmode = TEST_PACKET; + testmode = USB_TEST_PACKET; else if (!strncmp(buf, "test_force_enable", 17)) - testmode = TEST_FORCE_EN; + testmode = USB_TEST_FORCE_ENABLE; else testmode = 0; @@ -78,19 +78,19 @@ static int testmode_show(struct seq_file *s, void *unused) case 0: seq_puts(s, "no test\n"); break; - case TEST_J: + case USB_TEST_J: seq_puts(s, "test_j\n"); break; - case TEST_K: + case USB_TEST_K: seq_puts(s, "test_k\n"); break; - case TEST_SE0_NAK: + case USB_TEST_SE0_NAK: seq_puts(s, "test_se0_nak\n"); break; - case TEST_PACKET: + case USB_TEST_PACKET: seq_puts(s, "test_packet\n"); break; - case TEST_FORCE_EN: + case USB_TEST_FORCE_ENABLE: seq_puts(s, "test_force_enable\n"); break; default: diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 12b98b466287..38fc46b0c026 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -1561,11 +1561,11 @@ int dwc2_hsotg_set_test_mode(struct dwc2_hsotg *hsotg, int testmode) dctl &= ~DCTL_TSTCTL_MASK; switch (testmode) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: dctl |= testmode << DCTL_TSTCTL_SHIFT; break; default: diff --git a/drivers/usb/dwc3/debugfs.c b/drivers/usb/dwc3/debugfs.c index 6d9de334e46a..14dc6a37305d 100644 --- a/drivers/usb/dwc3/debugfs.c +++ b/drivers/usb/dwc3/debugfs.c @@ -466,19 +466,19 @@ static int dwc3_testmode_show(struct seq_file *s, void *unused) case 0: seq_printf(s, "no test\n"); break; - case TEST_J: + case USB_TEST_J: seq_printf(s, "test_j\n"); break; - case TEST_K: + case USB_TEST_K: seq_printf(s, "test_k\n"); break; - case TEST_SE0_NAK: + case USB_TEST_SE0_NAK: seq_printf(s, "test_se0_nak\n"); break; - case TEST_PACKET: + case USB_TEST_PACKET: seq_printf(s, "test_packet\n"); break; - case TEST_FORCE_EN: + case USB_TEST_FORCE_ENABLE: seq_printf(s, "test_force_enable\n"); break; default: @@ -506,15 +506,15 @@ static ssize_t dwc3_testmode_write(struct file *file, return -EFAULT; if (!strncmp(buf, "test_j", 6)) - testmode = TEST_J; + testmode = USB_TEST_J; else if (!strncmp(buf, "test_k", 6)) - testmode = TEST_K; + testmode = USB_TEST_K; else if (!strncmp(buf, "test_se0_nak", 12)) - testmode = TEST_SE0_NAK; + testmode = USB_TEST_SE0_NAK; else if (!strncmp(buf, "test_packet", 11)) - testmode = TEST_PACKET; + testmode = USB_TEST_PACKET; else if (!strncmp(buf, "test_force_enable", 17)) - testmode = TEST_FORCE_EN; + testmode = USB_TEST_FORCE_ENABLE; else testmode = 0; diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 6dee4dabc0a4..8dd69728add3 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -425,11 +425,11 @@ static int dwc3_ep0_handle_test(struct dwc3 *dwc, enum usb_device_state state, return -EINVAL; switch (wIndex >> 8) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: dwc->test_mode_nr = wIndex >> 8; dwc->test_mode = true; break; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 80c3ef134e41..0b59b2f1cf26 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -46,11 +46,11 @@ int dwc3_gadget_set_test_mode(struct dwc3 *dwc, int mode) reg &= ~DWC3_DCTL_TSTCTRL_MASK; switch (mode) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: reg |= mode << 1; break; default: diff --git a/drivers/usb/gadget/udc/bdc/bdc_ep.c b/drivers/usb/gadget/udc/bdc/bdc_ep.c index d49c6dc1082d..ba250cf75bef 100644 --- a/drivers/usb/gadget/udc/bdc/bdc_ep.c +++ b/drivers/usb/gadget/udc/bdc/bdc_ep.c @@ -927,11 +927,11 @@ static int bdc_set_test_mode(struct bdc *bdc) usb2_pm &= ~BDC_PTC_MASK; dev_dbg(bdc->dev, "%s\n", __func__); switch (bdc->test_mode) { - case TEST_J: - case TEST_K: - case TEST_SE0_NAK: - case TEST_PACKET: - case TEST_FORCE_EN: + case USB_TEST_J: + case USB_TEST_K: + case USB_TEST_SE0_NAK: + case USB_TEST_PACKET: + case USB_TEST_FORCE_ENABLE: usb2_pm |= bdc->test_mode << 28; break; default: diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c index 7164ad9800f1..345e28d76709 100644 --- a/drivers/usb/gadget/udc/gr_udc.c +++ b/drivers/usb/gadget/udc/gr_udc.c @@ -912,9 +912,9 @@ static int gr_device_request(struct gr_udc *dev, u8 type, u8 request, return gr_ep0_respond_empty(dev); case USB_DEVICE_TEST_MODE: - /* The hardware does not support TEST_FORCE_EN */ + /* The hardware does not support USB_TEST_FORCE_ENABLE */ test = index >> 8; - if (test >= TEST_J && test <= TEST_PACKET) { + if (test >= USB_TEST_J && test <= USB_TEST_PACKET) { dev->test_mode = test; return gr_ep0_respond(dev, NULL, 0, gr_ep0_testmode_complete); diff --git a/drivers/usb/gadget/udc/mv_udc_core.c b/drivers/usb/gadget/udc/mv_udc_core.c index cafde053788b..69289717d856 100644 --- a/drivers/usb/gadget/udc/mv_udc_core.c +++ b/drivers/usb/gadget/udc/mv_udc_core.c @@ -1502,7 +1502,7 @@ out: static void mv_udc_testmode(struct mv_udc *udc, u16 index) { - if (index <= TEST_FORCE_EN) { + if (index <= USB_TEST_FORCE_ENABLE) { udc->test_mode = index; if (udc_prime_status(udc, EP_DIR_IN, 0, true)) ep0_stall(udc); diff --git a/drivers/usb/gadget/udc/net2272.c b/drivers/usb/gadget/udc/net2272.c index 928057b206f1..fbbe62513545 100644 --- a/drivers/usb/gadget/udc/net2272.c +++ b/drivers/usb/gadget/udc/net2272.c @@ -1688,7 +1688,7 @@ net2272_set_test_mode(struct net2272 *dev, int mode) net2272_write(dev, USBTEST, mode); /* load test packet */ - if (mode == TEST_PACKET) { + if (mode == USB_TEST_PACKET) { /* switch to 8 bit mode */ net2272_write(dev, LOCCTL, net2272_read(dev, LOCCTL) & ~(1 << DATA_WIDTH)); diff --git a/drivers/usb/gadget/udc/net2272.h b/drivers/usb/gadget/udc/net2272.h index 8e644627992d..87d0ab9ffeeb 100644 --- a/drivers/usb/gadget/udc/net2272.h +++ b/drivers/usb/gadget/udc/net2272.h @@ -105,11 +105,6 @@ #define USBTEST 0x32 #define TEST_MODE_SELECT 0 #define NORMAL_OPERATION 0 -#define TEST_J 1 -#define TEST_K 2 -#define TEST_SE0_NAK 3 -#define TEST_PACKET 4 -#define TEST_FORCE_ENABLE 5 #define XCVRDIAG 0x33 #define FORCE_FULL_SPEED 2 #define FORCE_HIGH_SPEED 3 diff --git a/drivers/usb/gadget/udc/udc-xilinx.c b/drivers/usb/gadget/udc/udc-xilinx.c index 709553bdb233..d5e9d20c097d 100644 --- a/drivers/usb/gadget/udc/udc-xilinx.c +++ b/drivers/usb/gadget/udc/udc-xilinx.c @@ -2097,9 +2097,9 @@ static int xudc_probe(struct platform_device *pdev) /* Check for IP endianness */ udc->write_fn = xudc_write32_be; udc->read_fn = xudc_read32_be; - udc->write_fn(udc->addr, XUSB_TESTMODE_OFFSET, TEST_J); + udc->write_fn(udc->addr, XUSB_TESTMODE_OFFSET, USB_TEST_J); if ((udc->read_fn(udc->addr + XUSB_TESTMODE_OFFSET)) - != TEST_J) { + != USB_TEST_J) { udc->write_fn = xudc_write32; udc->read_fn = xudc_read32; } diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index f37316d2c8fa..073c54e42223 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -612,7 +612,7 @@ static void xhci_port_set_test_mode(struct xhci_hcd *xhci, temp |= test_mode << PORT_TEST_MODE_SHIFT; writel(temp, port->addr + PORTPMSC); xhci->test_mode = test_mode; - if (test_mode == TEST_FORCE_EN) + if (test_mode == USB_TEST_FORCE_ENABLE) xhci_start(xhci); } @@ -666,7 +666,7 @@ static int xhci_exit_test_mode(struct xhci_hcd *xhci) xhci_err(xhci, "Not in test mode, do nothing.\n"); return 0; } - if (xhci->test_mode == TEST_FORCE_EN && + if (xhci->test_mode == USB_TEST_FORCE_ENABLE && !(xhci->xhc_state & XHCI_STATE_HALTED)) { retval = xhci_halt(xhci); if (retval) @@ -1421,7 +1421,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, /* 4.19.6 Port Test Modes (USB2 Test Mode) */ if (hcd->speed != HCD_USB2) goto error; - if (test_mode > TEST_FORCE_EN || test_mode < TEST_J) + if (test_mode > USB_TEST_FORCE_ENABLE || + test_mode < USB_TEST_J) goto error; retval = xhci_enter_test_mode(xhci, test_mode, wIndex, &flags); diff --git a/drivers/usb/misc/ehset.c b/drivers/usb/misc/ehset.c index 7895d61e733b..2752e1f4f4d0 100644 --- a/drivers/usb/misc/ehset.c +++ b/drivers/usb/misc/ehset.c @@ -33,28 +33,28 @@ static int ehset_probe(struct usb_interface *intf, ret = usb_control_msg(hub_udev, usb_sndctrlpipe(hub_udev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, - (TEST_SE0_NAK << 8) | portnum, + (USB_TEST_SE0_NAK << 8) | portnum, NULL, 0, 1000); break; case TEST_J_PID: ret = usb_control_msg(hub_udev, usb_sndctrlpipe(hub_udev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, - (TEST_J << 8) | portnum, + (USB_TEST_J << 8) | portnum, NULL, 0, 1000); break; case TEST_K_PID: ret = usb_control_msg(hub_udev, usb_sndctrlpipe(hub_udev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, - (TEST_K << 8) | portnum, + (USB_TEST_K << 8) | portnum, NULL, 0, 1000); break; case TEST_PACKET_PID: ret = usb_control_msg(hub_udev, usb_sndctrlpipe(hub_udev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, - (TEST_PACKET << 8) | portnum, + (USB_TEST_PACKET << 8) | portnum, NULL, 0, 1000); break; case TEST_HS_HOST_PORT_SUSPEND_RESUME: diff --git a/drivers/usb/mtu3/mtu3_gadget_ep0.c b/drivers/usb/mtu3/mtu3_gadget_ep0.c index 2be182bd793a..563a0a2e970d 100644 --- a/drivers/usb/mtu3/mtu3_gadget_ep0.c +++ b/drivers/usb/mtu3/mtu3_gadget_ep0.c @@ -278,20 +278,20 @@ static int handle_test_mode(struct mtu3 *mtu, struct usb_ctrlrequest *setup) u32 value; switch (le16_to_cpu(setup->wIndex) >> 8) { - case TEST_J: - dev_dbg(mtu->dev, "TEST_J\n"); + case USB_TEST_J: + dev_dbg(mtu->dev, "USB_TEST_J\n"); mtu->test_mode_nr = TEST_J_MODE; break; - case TEST_K: - dev_dbg(mtu->dev, "TEST_K\n"); + case USB_TEST_K: + dev_dbg(mtu->dev, "USB_TEST_K\n"); mtu->test_mode_nr = TEST_K_MODE; break; - case TEST_SE0_NAK: - dev_dbg(mtu->dev, "TEST_SE0_NAK\n"); + case USB_TEST_SE0_NAK: + dev_dbg(mtu->dev, "USB_TEST_SE0_NAK\n"); mtu->test_mode_nr = TEST_SE0_NAK_MODE; break; - case TEST_PACKET: - dev_dbg(mtu->dev, "TEST_PACKET\n"); + case USB_TEST_PACKET: + dev_dbg(mtu->dev, "USB_TEST_PACKET\n"); mtu->test_mode_nr = TEST_PACKET_MODE; break; default: diff --git a/drivers/usb/musb/musb_gadget_ep0.c b/drivers/usb/musb/musb_gadget_ep0.c index 91a5027b5c1f..0ae3e0be043e 100644 --- a/drivers/usb/musb/musb_gadget_ep0.c +++ b/drivers/usb/musb/musb_gadget_ep0.c @@ -311,27 +311,23 @@ __acquires(musb->lock) goto stall; switch (ctrlrequest->wIndex >> 8) { - case 1: - pr_debug("TEST_J\n"); - /* TEST_J */ + case USB_TEST_J: + pr_debug("USB_TEST_J\n"); musb->test_mode_nr = MUSB_TEST_J; break; - case 2: - /* TEST_K */ - pr_debug("TEST_K\n"); + case USB_TEST_K: + pr_debug("USB_TEST_K\n"); musb->test_mode_nr = MUSB_TEST_K; break; - case 3: - /* TEST_SE0_NAK */ - pr_debug("TEST_SE0_NAK\n"); + case USB_TEST_SE0_NAK: + pr_debug("USB_TEST_SE0_NAK\n"); musb->test_mode_nr = MUSB_TEST_SE0_NAK; break; - case 4: - /* TEST_PACKET */ - pr_debug("TEST_PACKET\n"); + case USB_TEST_PACKET: + pr_debug("USB_TEST_PACKET\n"); musb->test_mode_nr = MUSB_TEST_PACKET; break; diff --git a/drivers/usb/musb/musb_virthub.c b/drivers/usb/musb/musb_virthub.c index a84ec27c4c12..cb7ae297a3af 100644 --- a/drivers/usb/musb/musb_virthub.c +++ b/drivers/usb/musb/musb_virthub.c @@ -385,25 +385,25 @@ int musb_hub_control( wIndex >>= 8; switch (wIndex) { - case 1: - pr_debug("TEST_J\n"); + case USB_TEST_J: + pr_debug("USB_TEST_J\n"); temp = MUSB_TEST_J; break; - case 2: - pr_debug("TEST_K\n"); + case USB_TEST_K: + pr_debug("USB_TEST_K\n"); temp = MUSB_TEST_K; break; - case 3: - pr_debug("TEST_SE0_NAK\n"); + case USB_TEST_SE0_NAK: + pr_debug("USB_TEST_SE0_NAK\n"); temp = MUSB_TEST_SE0_NAK; break; - case 4: - pr_debug("TEST_PACKET\n"); + case USB_TEST_PACKET: + pr_debug("USB_TEST_PACKET\n"); temp = MUSB_TEST_PACKET; musb_load_testpacket(musb); break; - case 5: - pr_debug("TEST_FORCE_ENABLE\n"); + case USB_TEST_FORCE_ENABLE: + pr_debug("USB_TEST_FORCE_ENABLE\n"); temp = MUSB_TEST_FORCE_HOST | MUSB_TEST_FORCE_HS; diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index b1ed2ccfe9cf..48766fdf6580 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -138,11 +138,11 @@ * Test Mode Selectors * See USB 2.0 spec Table 9-7 */ -#define TEST_J 1 -#define TEST_K 2 -#define TEST_SE0_NAK 3 -#define TEST_PACKET 4 -#define TEST_FORCE_EN 5 +#define USB_TEST_J 1 +#define USB_TEST_K 2 +#define USB_TEST_SE0_NAK 3 +#define USB_TEST_PACKET 4 +#define USB_TEST_FORCE_ENABLE 5 /* Status Type */ #define USB_STATUS_TYPE_STANDARD 0 -- cgit v1.2.3 From f9bcf96837f158db6ea982d15cd2c8161ca6bc23 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 20 Jun 2020 18:30:52 +0300 Subject: bpf: Add SO_KEEPALIVE and related options to bpf_setsockopt This patch adds support of SO_KEEPALIVE flag and TCP related options to bpf_setsockopt() routine. This is helpful if we want to enable or tune TCP keepalive for applications which don't do it in the userspace code. v3: - update kernel-doc in uapi (Nikita Vetoshkin ) v4: - update kernel-doc in tools too (Alexei Starovoitov) - add test to selftests (Alexei Starovoitov) Signed-off-by: Dmitry Yakunin Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200620153052.9439-3-zeil@yandex-team.ru --- include/uapi/linux/bpf.h | 7 +++-- net/core/filter.c | 36 ++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 7 +++-- tools/testing/selftests/bpf/progs/connect4_prog.c | 27 +++++++++++++++++ 4 files changed, 72 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9d3923e6b860..d9737d51dd19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1621,10 +1621,13 @@ union bpf_attr { * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/net/core/filter.c b/net/core/filter.c index 73395384afe2..c713b6b8938f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4289,10 +4289,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { char devname[IFNAMSIZ]; + int val, valbool; struct net *net; int ifindex; int ret = 0; - int val; if (!sk_fullsock(sk)) return -EINVAL; @@ -4303,6 +4303,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); + valbool = val ? 1 : 0; /* Only some socketops are supported */ switch (optname) { @@ -4361,6 +4362,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } ret = sock_bindtoindex(sk, ifindex, false); break; + case SO_KEEPALIVE: + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; default: ret = -EINVAL; } @@ -4421,6 +4427,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ret = tcp_set_congestion_control(sk, name, false, reinit, true); } else { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if (optlen != sizeof(int)) @@ -4449,6 +4456,33 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else tp->save_syn = val; break; + case TCP_KEEPIDLE: + ret = tcp_sock_set_keepidle_locked(sk, val); + break; + case TCP_KEEPINTVL: + if (val < 1 || val > MAX_TCP_KEEPINTVL) + ret = -EINVAL; + else + tp->keepalive_intvl = val * HZ; + break; + case TCP_KEEPCNT: + if (val < 1 || val > MAX_TCP_KEEPCNT) + ret = -EINVAL; + else + tp->keepalive_probes = val; + break; + case TCP_SYNCNT: + if (val < 1 || val > MAX_TCP_SYNCNT) + ret = -EINVAL; + else + icsk->icsk_syn_retries = val; + break; + case TCP_USER_TIMEOUT: + if (val < 0) + ret = -EINVAL; + else + icsk->icsk_user_timeout = val; + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9d3923e6b860..d9737d51dd19 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1621,10 +1621,13 @@ union bpf_attr { * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 1ab2c5eba86c..b1b2773c0b9d 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -104,6 +104,30 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int set_keepalive(struct bpf_sock_addr *ctx) +{ + int zero = 0, one = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one))) + return 1; + if (ctx->type == SOCK_STREAM) { + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPIDLE, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPINTVL, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPCNT, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_SYNCNT, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_USER_TIMEOUT, &one, sizeof(one))) + return 1; + } + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &zero, sizeof(zero))) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -121,6 +145,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) if (bind_to_device(ctx)) return 0; + if (set_keepalive(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) -- cgit v1.2.3 From dc5bdb68b5b369d5bc7d1de96fa64cc1737a6320 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 24 Jun 2020 11:29:10 +0200 Subject: drm/fb-helper: Fix vt restore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the past we had a pile of hacks to orchestrate access between fbdev emulation and native kms clients. We've tried to streamline this, by always preferring the kms side above fbdev calls when a drm master exists, because drm master controls access to the display resources. Unfortunately this breaks existing userspace, specifically Xorg. When exiting Xorg first restores the console to text mode using the KDSET ioctl on the vt. This does nothing, because a drm master is still around. Then it drops the drm master status, which again does nothing, because logind is keeping additional drm fd open to be able to orchestrate vt switches. In the past this is the point where fbdev was restored, as part of the ->lastclose hook on the drm side. Now to fix this regression we don't want to go back to letting fbdev restore things whenever it feels like, or to the pile of hacks we've had before. Instead try and go with a minimal exception to make the KDSET case work again, and nothing else. This means that if userspace does a KDSET call when switching between graphical compositors, there will be some flickering with fbcon showing up for a bit. But a) that's not a regression and b) userspace can fix it by improving the vt switching dance - logind should have all the information it needs. While pondering all this I'm also wondering wheter we should have a SWITCH_MASTER ioctl to allow race-free master status handover. But that's for another day. v2: Somehow forgot to cc all the fbdev people. v3: Fix typo Alex spotted. Reviewed-by: Alex Deucher Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=208179 Cc: shlomo@fastmail.com Reported-and-Tested-by: shlomo@fastmail.com Cc: Michel Dänzer Fixes: 64914da24ea9 ("drm/fbdev-helper: don't force restores") Cc: Noralf Trønnes Cc: Thomas Zimmermann Cc: Daniel Vetter Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: # v5.7+ Cc: Bartlomiej Zolnierkiewicz Cc: Geert Uytterhoeven Cc: Nathan Chancellor Cc: Qiujun Huang Cc: Peter Rosin Cc: linux-fbdev@vger.kernel.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200624092910.3280448-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_fb_helper.c | 63 +++++++++++++++++++++++++++++++--------- drivers/video/fbdev/core/fbcon.c | 3 +- include/uapi/linux/fb.h | 1 + 3 files changed, 52 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 170aa7689110..5609e164805f 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -227,18 +227,9 @@ int drm_fb_helper_debug_leave(struct fb_info *info) } EXPORT_SYMBOL(drm_fb_helper_debug_leave); -/** - * drm_fb_helper_restore_fbdev_mode_unlocked - restore fbdev configuration - * @fb_helper: driver-allocated fbdev helper, can be NULL - * - * This should be called from driver's drm &drm_driver.lastclose callback - * when implementing an fbcon on top of kms using this helper. This ensures that - * the user isn't greeted with a black screen when e.g. X dies. - * - * RETURNS: - * Zero if everything went ok, negative error code otherwise. - */ -int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) +static int +__drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper, + bool force) { bool do_delayed; int ret; @@ -250,7 +241,16 @@ int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) return 0; mutex_lock(&fb_helper->lock); - ret = drm_client_modeset_commit(&fb_helper->client); + if (force) { + /* + * Yes this is the _locked version which expects the master lock + * to be held. But for forced restores we're intentionally + * racing here, see drm_fb_helper_set_par(). + */ + ret = drm_client_modeset_commit_locked(&fb_helper->client); + } else { + ret = drm_client_modeset_commit(&fb_helper->client); + } do_delayed = fb_helper->delayed_hotplug; if (do_delayed) @@ -262,6 +262,22 @@ int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) return ret; } + +/** + * drm_fb_helper_restore_fbdev_mode_unlocked - restore fbdev configuration + * @fb_helper: driver-allocated fbdev helper, can be NULL + * + * This should be called from driver's drm &drm_driver.lastclose callback + * when implementing an fbcon on top of kms using this helper. This ensures that + * the user isn't greeted with a black screen when e.g. X dies. + * + * RETURNS: + * Zero if everything went ok, negative error code otherwise. + */ +int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) +{ + return __drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper, false); +} EXPORT_SYMBOL(drm_fb_helper_restore_fbdev_mode_unlocked); #ifdef CONFIG_MAGIC_SYSRQ @@ -1318,6 +1334,7 @@ int drm_fb_helper_set_par(struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct fb_var_screeninfo *var = &info->var; + bool force; if (oops_in_progress) return -EBUSY; @@ -1327,7 +1344,25 @@ int drm_fb_helper_set_par(struct fb_info *info) return -EINVAL; } - drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper); + /* + * Normally we want to make sure that a kms master takes precedence over + * fbdev, to avoid fbdev flickering and occasionally stealing the + * display status. But Xorg first sets the vt back to text mode using + * the KDSET IOCTL with KD_TEXT, and only after that drops the master + * status when exiting. + * + * In the past this was caught by drm_fb_helper_lastclose(), but on + * modern systems where logind always keeps a drm fd open to orchestrate + * the vt switching, this doesn't work. + * + * To not break the userspace ABI we have this special case here, which + * is only used for the above case. Everything else uses the normal + * commit function, which ensures that we never steal the display from + * an active drm master. + */ + force = var->activate & FB_ACTIVATE_KD_TEXT; + + __drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper, force); return 0; } diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 9d28a8e3328f..e2a490c5ae08 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2402,7 +2402,8 @@ static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch) ops->graphics = 1; if (!blank) { - var.activate = FB_ACTIVATE_NOW | FB_ACTIVATE_FORCE; + var.activate = FB_ACTIVATE_NOW | FB_ACTIVATE_FORCE | + FB_ACTIVATE_KD_TEXT; fb_set_var(info, &var); ops->graphics = 0; ops->var = info->var; diff --git a/include/uapi/linux/fb.h b/include/uapi/linux/fb.h index b6aac7ee1f67..4c14e8be7267 100644 --- a/include/uapi/linux/fb.h +++ b/include/uapi/linux/fb.h @@ -205,6 +205,7 @@ struct fb_bitfield { #define FB_ACTIVATE_ALL 64 /* change all VCs on this fb */ #define FB_ACTIVATE_FORCE 128 /* force apply even when no change*/ #define FB_ACTIVATE_INV_MODE 256 /* invalidate videomode */ +#define FB_ACTIVATE_KD_TEXT 512 /* for KDSET vt ioctl */ #define FB_ACCELF_TEXT 1 /* (OBSOLETE) see fb_info.flags and vc_mode */ -- cgit v1.2.3 From 899426b3bdd947541ba4af8c767575889c8b842a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:16 +0300 Subject: net: neighbor: add fdb extended attribute Add an attribute to NDA which will contain all future fdb-specific attributes in order to avoid polluting the NDA namespace with e.g. bridge or vxlan specific attributes. The attribute is called NDA_FDB_EXT_ATTRS and the structure would look like: [NDA_FDB_EXT_ATTRS] = { [NFEA_xxx] } Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 12 ++++++++++++ net/core/neighbour.c | 1 + 2 files changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index eefcda8ca44e..540ff48402a1 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -30,6 +30,7 @@ enum { NDA_SRC_VNI, NDA_PROTOCOL, /* Originator of entry */ NDA_NH_ID, + NDA_FDB_EXT_ATTRS, __NDA_MAX }; @@ -172,4 +173,15 @@ enum { }; #define NDTA_MAX (__NDTA_MAX - 1) +/* embedded into NDA_FDB_EXT_ATTRS: + * [NDA_FDB_EXT_ATTRS] = { + * ... + * } + */ +enum { + NFEA_UNSPEC, + __NFEA_MAX +}; +#define NFEA_MAX (__NFEA_MAX - 1) + #endif diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef6b5a8f629c..8e39e28b0a8d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1783,6 +1783,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From 31cbc39b6344916c20452e43a9171009214c409c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:17 +0300 Subject: net: bridge: add option to allow activity notifications for any fdb entries This patch adds the ability to notify about activity of any entries (static, permanent or ext_learn). EVPN multihoming peers need it to properly and efficiently handle mac sync (peer active/locally active). We add a new NFEA_ACTIVITY_NOTIFY attribute which is used to dump the current activity state and to control if static entries should be monitored at all. We use 2 bits - one to activate fdb entry tracking (disabled by default) and the second to denote that an entry is inactive. We need the second bit in order to avoid multiple notifications of inactivity. Obviously this makes no difference for dynamic entries since at the time of inactivity they get deleted, while the tracked non-dynamic entries get the inactive bit set and get a notification. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 11 ++++ net/bridge/br_fdb.c | 117 ++++++++++++++++++++++++++++++++++++----- net/bridge/br_private.h | 4 ++ 3 files changed, 119 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 540ff48402a1..21e569297355 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -173,13 +173,24 @@ enum { }; #define NDTA_MAX (__NDTA_MAX - 1) + /* FDB activity notification bits used in NFEA_ACTIVITY_NOTIFY: + * - FDB_NOTIFY_BIT - notify on activity/expire for any entry + * - FDB_NOTIFY_INACTIVE_BIT - mark as inactive to avoid multiple notifications + */ +enum { + FDB_NOTIFY_BIT = (1 << 0), + FDB_NOTIFY_INACTIVE_BIT = (1 << 1) +}; + /* embedded into NDA_FDB_EXT_ATTRS: * [NDA_FDB_EXT_ATTRS] = { + * [NFEA_ACTIVITY_NOTIFY] * ... * } */ enum { NFEA_UNSPEC, + NFEA_ACTIVITY_NOTIFY, __NFEA_MAX }; #define NFEA_MAX (__NFEA_MAX - 1) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index ed80d9ab0fb9..642deb57c064 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -349,12 +349,21 @@ void br_fdb_cleanup(struct work_struct *work) */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - unsigned long this_timer; + unsigned long this_timer = f->updated + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || - test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { + if (test_bit(BR_FDB_NOTIFY, &f->flags)) { + if (time_after(this_timer, now)) + work_delay = min(work_delay, + this_timer - now); + else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, + &f->flags)) + fdb_notify(br, f, RTM_NEWNEIGH, false); + } continue; - this_timer = f->updated + delay; + } + if (time_after(this_timer, now)) { work_delay = min(work_delay, this_timer - now); } else { @@ -556,11 +565,17 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return ret; } +/* returns true if the fdb was modified */ +static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb) +{ + return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) && + test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)); +} + void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; - bool fdb_modified = false; /* some users want to always flood. */ if (hold_time(br) == 0) @@ -575,6 +590,12 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->dev->name, addr, vid); } else { unsigned long now = jiffies; + bool fdb_modified = false; + + if (now != fdb->updated) { + fdb->updated = now; + fdb_modified = __fdb_mark_active(fdb); + } /* fastpath: update of existing entry */ if (unlikely(source != fdb->dst && @@ -587,8 +608,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); } - if (now != fdb->updated) - fdb->updated = now; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { @@ -667,6 +687,23 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, &fdb->key.vlan_id)) goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { + struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); + u8 notify_bits = FDB_NOTIFY_BIT; + + if (!nest) + goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + notify_bits |= FDB_NOTIFY_INACTIVE_BIT; + + if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { + nla_nest_cancel(skb, nest); + goto nla_put_failure; + } + + nla_nest_end(skb, nest); + } + nlmsg_end(skb, nlh); return 0; @@ -681,7 +718,9 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ - + nla_total_size(sizeof(struct nda_cacheinfo)); + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } static void fdb_notify(struct net_bridge *br, @@ -791,14 +830,40 @@ errout: return err; } +/* returns true if the fdb is modified */ +static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify) +{ + bool modified = false; + + /* allow to mark an entry as inactive, usually done on creation */ + if ((notify & FDB_NOTIFY_INACTIVE_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + modified = true; + + if ((notify & FDB_NOTIFY_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* enabled activity tracking */ + modified = true; + } else if (!(notify & FDB_NOTIFY_BIT) && + test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* disabled activity tracking, clear notify state */ + clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags); + modified = true; + } + + return modified; +} + /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, - const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid) + const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid, + struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; + u8 notify = 0; /* If the port cannot learn allow only local and static entries */ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && @@ -815,6 +880,13 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (is_sticky && (state & NUD_PERMANENT)) return -EINVAL; + if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) { + notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]); + if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) || + (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT) + return -EINVAL; + } + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -858,6 +930,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + if (fdb_handle_notify(fdb, notify)) + modified = true; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; @@ -871,7 +946,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, - u16 nlh_flags, u16 vid) + u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[]) { int err = 0; @@ -893,19 +968,24 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); - err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid); + err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); spin_unlock_bh(&br->hash_lock); } return err; } +static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { + [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, +}; + /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, struct netlink_ext_ack *extack) { + struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; @@ -938,6 +1018,16 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], vg = nbp_vlan_group(p); } + if (tb[NDA_FDB_EXT_ATTRS]) { + attr = tb[NDA_FDB_EXT_ATTRS]; + err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, + br_nda_fdb_pol, extack); + if (err) + return err; + } else { + memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1)); + } + if (vid) { v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { @@ -946,9 +1036,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } /* VID was specified, so use it. */ - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb); } else { - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb); if (err || !vg || !vg->num_vlans) goto out; @@ -959,7 +1049,8 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, + nfea_tb); if (err) goto out; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7501be4eeba0..c0ae639e1b36 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -48,6 +48,8 @@ enum { /* Path to usermode spanning tree program */ #define BR_STP_PROG "/sbin/bridge-stp" +#define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT) + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; @@ -184,6 +186,8 @@ enum { BR_FDB_ADDED_BY_USER, BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, + BR_FDB_NOTIFY, + BR_FDB_NOTIFY_INACTIVE }; struct net_bridge_fdb_key { -- cgit v1.2.3 From b5f1d9ec283bd28a452cf61d7e5c2f2b1a9cccda Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:18 +0300 Subject: net: bridge: add a flag to avoid refreshing fdb when changing/adding When we modify or create a new fdb entry sometimes we want to avoid refreshing its activity in order to track it properly. One example is when a mac is received from EVPN multi-homing peer by FRR, which doesn't want to change local activity accounting. It makes it static and sets a flag to track its activity. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 1 + net/bridge/br_fdb.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 21e569297355..dc8b72201f6c 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -191,6 +191,7 @@ enum { enum { NFEA_UNSPEC, NFEA_ACTIVITY_NOTIFY, + NFEA_DONT_REFRESH, __NFEA_MAX }; #define NFEA_MAX (__NFEA_MAX - 1) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 642deb57c064..9db504baa094 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -860,6 +860,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); + bool refresh = !nfea_tb[NFEA_DONT_REFRESH]; struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; @@ -937,7 +938,8 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, fdb->used = jiffies; if (modified) { - fdb->updated = jiffies; + if (refresh) + fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH, true); } @@ -977,6 +979,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, + [NFEA_DONT_REFRESH] = { .type = NLA_FLAG }, }; /* Add new permanent fdb entry with RTM_NEWNEIGH */ -- cgit v1.2.3 From af7ec13833619e17f03aa73a785a2f871da6d66b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:09 -0700 Subject: bpf: Add bpf_skc_to_tcp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a tcp6_sock pointer. The return value could be NULL if the casting is illegal. A new helper return type RET_PTR_TO_BTF_ID_OR_NULL is added so the verifier is able to deduce proper return types for the helper. Different from the previous BTF_ID based helpers, the bpf_skc_to_tcp6_sock() argument can be several possible btf_ids. More specifically, all possible socket data structures with sock_common appearing in the first in the memory layout. This patch only added socket types related to tcp and udp. All possible argument btf_id and return value btf_id for helper bpf_skc_to_tcp6_sock() are pre-calculcated and cached. In the future, it is even possible to precompute these btf_id's at kernel build time. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230809.3988195-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++ include/uapi/linux/bpf.h | 9 ++++- kernel/bpf/btf.c | 1 + kernel/bpf/verifier.c | 43 ++++++++++++++++------ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 82 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 ++++- 8 files changed, 148 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e1501ee53ce..c0e38ad07848 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -265,6 +265,7 @@ enum bpf_return_type { RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ + RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -287,6 +288,12 @@ struct bpf_func_proto { enum bpf_arg_type arg_type[5]; }; int *btf_id; /* BTF ids of arguments */ + bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is + * valid. Often used if more + * than one btf id is permitted + * for this argument. + */ + int *ret_btf_id; /* return value btf_id */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -1524,6 +1531,7 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); +void init_btf_sock_ids(struct btf *btf); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1549,6 +1557,9 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) static inline void bpf_map_offload_map_free(struct bpf_map *map) { } +static inline void init_btf_sock_ids(struct btf *btf) +{ +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) @@ -1638,6 +1649,7 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d9737d51dd19..e90ad07b291a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3255,6 +3255,12 @@ union bpf_attr { * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3392,7 +3398,8 @@ union bpf_attr { FN(ringbuf_submit), \ FN(ringbuf_discard), \ FN(ringbuf_query), \ - FN(csum_level), + FN(csum_level), \ + FN(skc_to_tcp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e377d1981730..4c3007f428b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3674,6 +3674,7 @@ struct btf *btf_parse_vmlinux(void) goto errout; bpf_struct_ops_init(btf, log); + init_btf_sock_ids(btf); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7460f967cb75..7de98906ddf4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3800,12 +3800,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } -static int check_func_arg(struct bpf_verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) +static int check_func_arg(struct bpf_verifier_env *env, u32 arg, + struct bpf_call_arg_meta *meta, + const struct bpf_func_proto *fn) { + u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; + enum bpf_arg_type arg_type = fn->arg_type[arg]; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3885,9 +3887,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), + if (!fn->check_btf_id) { + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + } else if (!fn->check_btf_id(reg->btf_id, arg)) { + verbose(env, "Helper does not support %s in R%d\n", kernel_type_name(reg->btf_id), regno); return -EACCES; @@ -4709,10 +4718,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (!fn->check_btf_id) { + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; + } + err = check_func_arg(env, i, &meta, fn); if (err) return err; } @@ -4815,6 +4826,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + int ret_btf_id; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + ret_btf_id = *fn->ret_btf_id; + if (ret_btf_id == 0) { + verbose(env, "invalid return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); + return -EINVAL; + } + regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0159f12d2af5..2a97a268f533 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1515,6 +1515,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; + case BPF_FUNC_skc_to_tcp6_sock: + return &bpf_skc_to_tcp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index c713b6b8938f..176e27d75c51 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -9225,3 +9226,84 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } + +/* Define a list of socket types which can be the argument for + * skc_to_*_sock() helpers. All these sockets should have + * sock_common as the first argument in its memory layout. + */ +#define BTF_SOCK_TYPE_xxx \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock") + +enum { +#define BTF_SOCK_TYPE(name, str) name, +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +MAX_BTF_SOCK_TYPE, +}; + +static int btf_sock_ids[MAX_BTF_SOCK_TYPE]; + +#ifdef CONFIG_BPF_SYSCALL +static const char *bpf_sock_types[] = { +#define BTF_SOCK_TYPE(name, str) str, +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +}; + +void init_btf_sock_ids(struct btf *btf) +{ + int i, btf_id; + + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) { + btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i], + BTF_KIND_STRUCT); + if (btf_id > 0) + btf_sock_ids[i] = btf_id; + } +} +#endif + +static bool check_arg_btf_id(u32 btf_id, u32 arg) +{ + int i; + + /* only one argument, no need to check arg */ + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) + if (btf_sock_ids[i] == btf_id) + return true; + return false; +} + +BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) +{ + /* tcp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct tcp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && + sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { + .func = bpf_skc_to_tcp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 91fa668fa860..6c2f64118651 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -421,6 +421,7 @@ class PrinterHelpers(Printer): 'struct sockaddr', 'struct tcphdr', 'struct seq_file', + 'struct tcp6_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -458,6 +459,7 @@ class PrinterHelpers(Printer): 'struct sockaddr', 'struct tcphdr', 'struct seq_file', + 'struct tcp6_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d9737d51dd19..e90ad07b291a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3255,6 +3255,12 @@ union bpf_attr { * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3392,7 +3398,8 @@ union bpf_attr { FN(ringbuf_submit), \ FN(ringbuf_discard), \ FN(ringbuf_query), \ - FN(csum_level), + FN(csum_level), \ + FN(skc_to_tcp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:11 -0700 Subject: bpf: Add bpf_skc_to_{tcp, tcp_timewait, tcp_request}_sock() helpers Three more helpers are added to cast a sock_common pointer to an tcp_sock, tcp_timewait_sock or a tcp_request_sock for tracing programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230811.3988277-1-yhs@fb.com --- include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 23 +++++++++++++++- kernel/trace/bpf_trace.c | 6 ++++ net/core/filter.c | 62 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 6 ++++ tools/include/uapi/linux/bpf.h | 23 +++++++++++++++- 6 files changed, 121 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c0e38ad07848..c23998cf6699 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1650,6 +1650,9 @@ extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e90ad07b291a..b9412ab275f3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3261,6 +3261,24 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3399,7 +3417,10 @@ union bpf_attr { FN(ringbuf_discard), \ FN(ringbuf_query), \ FN(csum_level), \ - FN(skc_to_tcp6_sock), + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2a97a268f533..48d935b0d87c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1517,6 +1517,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_output_proto; case BPF_FUNC_skc_to_tcp6_sock: return &bpf_skc_to_tcp6_sock_proto; + case BPF_FUNC_skc_to_tcp_sock: + return &bpf_skc_to_tcp_sock_proto; + case BPF_FUNC_skc_to_tcp_timewait_sock: + return &bpf_skc_to_tcp_timewait_sock_proto; + case BPF_FUNC_skc_to_tcp_request_sock: + return &bpf_skc_to_tcp_request_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 176e27d75c51..0b4e5aed7e20 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -74,6 +74,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -9307,3 +9308,64 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .check_btf_id = check_arg_btf_id, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; + +BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { + .func = bpf_skc_to_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +}; + +BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) +{ + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { + .func = bpf_skc_to_tcp_timewait_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], +}; + +BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) +{ + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { + .func = bpf_skc_to_tcp_request_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6c2f64118651..d886657c6aaa 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -422,6 +422,9 @@ class PrinterHelpers(Printer): 'struct tcphdr', 'struct seq_file', 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -460,6 +463,9 @@ class PrinterHelpers(Printer): 'struct tcphdr', 'struct seq_file', 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e90ad07b291a..b9412ab275f3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3261,6 +3261,24 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3399,7 +3417,10 @@ union bpf_attr { FN(ringbuf_discard), \ FN(ringbuf_query), \ FN(csum_level), \ - FN(skc_to_tcp6_sock), + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 0d4fad3e57df2bf61e8ffc8d12a34b1caf9b8835 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:15 -0700 Subject: bpf: Add bpf_skc_to_udp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a udp6_sock pointer. The return value could be NULL if the casting is illegal. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/20200623230815.3988481-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 ++++++++- kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 22 ++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 ++++++++- 6 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c23998cf6699..3d2ade703a35 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1653,6 +1653,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9412ab275f3..0cb8ec948816 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3279,6 +3279,12 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3420,7 +3426,8 @@ union bpf_attr { FN(skc_to_tcp6_sock), \ FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 48d935b0d87c..5d59dda5f661 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1523,6 +1523,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_timewait_sock_proto; case BPF_FUNC_skc_to_tcp_request_sock: return &bpf_skc_to_tcp_request_sock_proto; + case BPF_FUNC_skc_to_udp6_sock: + return &bpf_skc_to_udp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 0b4e5aed7e20..c796e141ea8e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9369,3 +9369,25 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .check_btf_id = check_arg_btf_id, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; + +BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) +{ + /* udp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct udp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && + sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { + .func = bpf_skc_to_udp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index d886657c6aaa..6bab40ff442e 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -425,6 +425,7 @@ class PrinterHelpers(Printer): 'struct tcp_sock', 'struct tcp_timewait_sock', 'struct tcp_request_sock', + 'struct udp6_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -466,6 +467,7 @@ class PrinterHelpers(Printer): 'struct tcp_sock', 'struct tcp_timewait_sock', 'struct tcp_request_sock', + 'struct udp6_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9412ab275f3..0cb8ec948816 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3279,6 +3279,12 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3420,7 +3426,8 @@ union bpf_attr { FN(skc_to_tcp6_sock), \ FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 4c342f778fe234e0c2a2601d87fec8ba42f0d2c6 Mon Sep 17 00:00:00 2001 From: Rao Shoaib Date: Thu, 25 Jun 2020 13:46:00 -0700 Subject: rds: transport module should be auto loaded when transport is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enhancement auto loads transport module when the transport is set via SO_RDS_TRANSPORT socket option. Reviewed-by: Ka-Cheong Poon Reviewed-by: Håkon Bugge Signed-off-by: Rao Shoaib Signed-off-by: Somasundaram Krishnasamy Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 4 +++- net/rds/transport.c | 26 +++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index cba368e55863..c21edb966c19 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -64,10 +64,12 @@ /* supported values for SO_RDS_TRANSPORT */ #define RDS_TRANS_IB 0 -#define RDS_TRANS_IWARP 1 +#define RDS_TRANS_GAP 1 #define RDS_TRANS_TCP 2 #define RDS_TRANS_COUNT 3 #define RDS_TRANS_NONE (~0) +/* don't use RDS_TRANS_IWARP - it is deprecated */ +#define RDS_TRANS_IWARP RDS_TRANS_GAP /* IOCTLS commands for SOL_RDS */ #define SIOCRDSSETTOS (SIOCPROTOPRIVATE) diff --git a/net/rds/transport.c b/net/rds/transport.c index 46f709a4b577..f8001ec80867 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -38,6 +38,12 @@ #include "rds.h" #include "loop.h" +static char * const rds_trans_modules[] = { + [RDS_TRANS_IB] = "rds_rdma", + [RDS_TRANS_GAP] = NULL, + [RDS_TRANS_TCP] = "rds_tcp", +}; + static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); @@ -110,18 +116,20 @@ struct rds_transport *rds_trans_get(int t_type) { struct rds_transport *ret = NULL; struct rds_transport *trans; - unsigned int i; down_read(&rds_trans_sem); - for (i = 0; i < RDS_TRANS_COUNT; i++) { - trans = transports[i]; - - if (trans && trans->t_type == t_type && - (!trans->t_owner || try_module_get(trans->t_owner))) { - ret = trans; - break; - } + trans = transports[t_type]; + if (!trans) { + up_read(&rds_trans_sem); + if (rds_trans_modules[t_type]) + request_module(rds_trans_modules[t_type]); + down_read(&rds_trans_sem); + trans = transports[t_type]; } + if (trans && trans->t_type == t_type && + (!trans->t_owner || try_module_get(trans->t_owner))) + ret = trans; + up_read(&rds_trans_sem); return ret; -- cgit v1.2.3 From bccb48c89fe3c09f1cbb7c8612e31f5daa1d4541 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 1 Jun 2020 20:13:21 +0200 Subject: batman-adv: Fix typos and grammar in documentation Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- Documentation/networking/batman-adv.rst | 6 ++-- include/uapi/linux/batadv_packet.h | 50 ++++++++++++++++----------------- include/uapi/linux/batman_adv.h | 4 +-- net/batman-adv/bat_iv_ogm.c | 8 +++--- net/batman-adv/bat_v_elp.c | 10 +++---- net/batman-adv/bat_v_ogm.c | 14 ++++----- net/batman-adv/bridge_loop_avoidance.c | 6 ++-- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/fragmentation.c | 6 ++-- net/batman-adv/hard-interface.c | 14 ++++----- net/batman-adv/log.h | 6 ++-- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 6 ++-- net/batman-adv/multicast.c | 21 +++++++------- net/batman-adv/netlink.c | 2 +- net/batman-adv/network-coding.c | 14 ++++----- net/batman-adv/originator.c | 8 +++--- net/batman-adv/routing.c | 4 +-- net/batman-adv/send.c | 4 +-- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/tp_meter.c | 12 ++++---- net/batman-adv/translation-table.c | 10 +++---- net/batman-adv/tvlv.c | 4 +-- net/batman-adv/types.h | 12 ++++---- 24 files changed, 114 insertions(+), 113 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/batman-adv.rst b/Documentation/networking/batman-adv.rst index 02af49b08635..74821d29a22f 100644 --- a/Documentation/networking/batman-adv.rst +++ b/Documentation/networking/batman-adv.rst @@ -73,7 +73,7 @@ lower value. This will make the mesh more responsive to topology changes, but will also increase the overhead. Information about the current state can be accessed via the batadv generic -netlink family. batctl provides human readable version via its debug tables +netlink family. batctl provides a human readable version via its debug tables subcommands. @@ -115,8 +115,8 @@ are prefixed with "batman-adv:" So to see just these messages try:: $ dmesg | grep batman-adv When investigating problems with your mesh network, it is sometimes necessary to -see more detail debug messages. This must be enabled when compiling the -batman-adv module. When building batman-adv as part of kernel, use "make +see more detailed debug messages. This must be enabled when compiling the +batman-adv module. When building batman-adv as part of the kernel, use "make menuconfig" and enable the option ``B.A.T.M.A.N. debugging`` (``CONFIG_BATMAN_ADV_DEBUG=y``). diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 0ae34c85ef9e..9c8604c5b5f6 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -72,8 +72,8 @@ enum batadv_subtype { /** * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets - * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was - * previously received from someone else than the best neighbor. + * @BATADV_NOT_BEST_NEXT_HOP: flag is set when the ogm packet is forwarded and + * was previously received from someone other than the best neighbor. * @BATADV_PRIMARIES_FIRST_HOP: flag unused. * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a * one hop neighbor on the interface where it was originally received. @@ -195,8 +195,8 @@ struct batadv_bla_claim_dst { /** * struct batadv_ogm_packet - ogm (routing protocol) packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @flags: contains routing relevant flags - see enum batadv_iv_flags * @seqno: sequence identification * @orig: address of the source node @@ -247,7 +247,7 @@ struct batadv_ogm2_packet { /** * struct batadv_elp_packet - elp (neighbor discovery) packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header + * @version: batman-adv protocol version, part of the general header * @orig: originator mac address * @seqno: sequence number * @elp_interval: currently used ELP sending interval in ms @@ -265,15 +265,15 @@ struct batadv_elp_packet { /** * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node * @uid: local ICMP socket identifier * @align: not used - useful for alignment purposes only * - * This structure is used for ICMP packets parsing only and it is never sent + * This structure is used for ICMP packet parsing only and it is never sent * over the wire. The alignment field at the end is there to ensure that * members are padded the same way as they are in real packets. */ @@ -291,8 +291,8 @@ struct batadv_icmp_header { /** * struct batadv_icmp_packet - ICMP packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -315,8 +315,8 @@ struct batadv_icmp_packet { /** * struct batadv_icmp_tp_packet - ICMP TP Meter packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -358,8 +358,8 @@ enum batadv_icmp_tp_subtype { /** * struct batadv_icmp_packet_rr - ICMP RouteRecord packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -397,8 +397,8 @@ struct batadv_icmp_packet_rr { /** * struct batadv_unicast_packet - unicast packet for network payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @ttvn: translation table version number * @dest: originator destination of the unicast packet */ @@ -433,8 +433,8 @@ struct batadv_unicast_4addr_packet { /** * struct batadv_frag_packet - fragmented packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @dest: final destination used when routing fragments * @orig: originator of the fragment used when merging the packet * @no: fragment number within this sequence @@ -467,8 +467,8 @@ struct batadv_frag_packet { /** * struct batadv_bcast_packet - broadcast packet for network payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @reserved: reserved byte for alignment * @seqno: sequence identification * @orig: originator of the broadcast packet @@ -488,10 +488,10 @@ struct batadv_bcast_packet { /** * struct batadv_coded_packet - network coded packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @first_source: original source of first included packet - * @first_orig_dest: original destinal of first included packet + * @first_orig_dest: original destination of first included packet * @first_crc: checksum of first included packet * @first_ttvn: tt-version number of first included packet * @second_ttl: ttl of second packet @@ -523,8 +523,8 @@ struct batadv_coded_packet { /** * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @reserved: reserved field (for packet alignment) * @src: address of the source * @dst: address of the destination diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 617c180ff0c8..8cf2ad11ead9 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -69,7 +69,7 @@ enum batadv_tt_client_flags { /** * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be - * part of the network but no nnode has already announced it + * part of the network but no node has already announced it */ BATADV_TT_CLIENT_TEMP = (1 << 11), }; @@ -131,7 +131,7 @@ enum batadv_gw_modes { /** @BATADV_GW_MODE_CLIENT: send DHCP requests to gw servers */ BATADV_GW_MODE_CLIENT, - /** @BATADV_GW_MODE_SERVER: announce itself as gatway server */ + /** @BATADV_GW_MODE_SERVER: announce itself as gateway server */ BATADV_GW_MODE_SERVER, }; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index e87f19c82e8d..5b3a41983156 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -134,7 +134,7 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) * * Return: the originator object corresponding to the passed mac address or NULL * on failure. - * If the object does not exists it is created an initialised. + * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -871,7 +871,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) } /** - * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface + * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast @@ -1554,7 +1554,7 @@ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) @@ -2288,7 +2288,7 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information - * @single_hardif: Limit dump to this hard interfaace + * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 0bdefa35da98..d35aca0e969a 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -60,7 +60,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) * @neigh: the neighbour for which the throughput has to be obtained * * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc). + * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). */ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) { @@ -183,8 +183,8 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) * * Sends a predefined number of unicast wifi packets to a given neighbour in * order to trigger the throughput estimation on this link by the RC algorithm. - * Packets are sent only if there there is not enough payload unicast traffic - * towards this neighbour.. + * Packets are sent only if there is not enough payload unicast traffic towards + * this neighbour.. * * Return: True on success and false in case of error during skb preparation. */ @@ -244,7 +244,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) * batadv_v_elp_periodic_work() - ELP periodic task per interface * @work: work queue item * - * Emits broadcast ELP message in regular intervals. + * Emits broadcast ELP messages in regular intervals. */ static void batadv_v_elp_periodic_work(struct work_struct *work) { @@ -499,7 +499,7 @@ orig_free: * @skb: the received packet * @if_incoming: the interface this packet was received through * - * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly + * Return: NET_RX_SUCCESS and consumes the skb if the packet was properly * processed or NET_RX_DROP in case of failure. */ int batadv_v_elp_packet_recv(struct sk_buff *skb, diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 18028b9f95f0..0d404f7bcd9f 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -47,9 +47,9 @@ * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * - * Return: the orig_node corresponding to the specified address. If such object - * does not exist it is allocated here. In case of allocation failure returns - * NULL. + * Return: the orig_node corresponding to the specified address. If such an + * object does not exist, it is allocated here. In case of allocation failure + * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -172,7 +172,7 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * - * Empties the OGMv2 aggregation queue and frees all the skbs it contained. + * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ @@ -378,7 +378,7 @@ static void batadv_v_ogm_send(struct work_struct *work) * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * - * Emits aggregated OGM message in regular intervals. + * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { @@ -399,7 +399,7 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * - * Takes care of scheduling own OGM sending routine for this interface. + * Takes care of scheduling its own OGM sending routine for this interface. * * Return: 0 on success or a negative error code otherwise */ @@ -847,7 +847,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 41cc87f06b14..91a04ca373dc 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -992,7 +992,7 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv, * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame * - * checks if it is a claim packet and if its on the same group. + * checks if it is a claim packet and if it's on the same group. * This function also applies the group ID of the sender * if it is in the same mesh. * @@ -1757,7 +1757,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * @vid: the VLAN ID of the frame * * Checks if this packet is a loop detect frame which has been sent by us, - * throw an uevent and log the event if that is the case. + * throws an uevent and logs the event if that is the case. * * Return: true if it is a loop detect frame which is to be dropped, false * otherwise. @@ -1815,7 +1815,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, * * we have to race for a claim * * if the frame is allowed on the LAN * - * in these cases, the skb is further handled by this function + * In these cases, the skb is further handled by this function * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b85da4b7a77b..0e6e53e9b5f3 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -666,7 +666,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, * @vid: VLAN identifier * @packet_subtype: unicast4addr packet subtype to use * - * This function copies the skb with pskb_copy() and is sent as unicast packet + * This function copies the skb with pskb_copy() and is sent as a unicast packet * to each of the selected candidates. * * Return: true if the packet is sent to at least one candidate, false diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 7cad97644d05..9fdbe3068153 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -102,8 +102,8 @@ static int batadv_frag_size_limit(void) * * Caller must hold chain->lock. * - * Return: true if chain is empty and caller can just insert the new fragment - * without searching for the right position. + * Return: true if chain is empty and the caller can just insert the new + * fragment without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, u16 seqno) @@ -306,7 +306,7 @@ free: * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and free skb. * - * Return: true when packet is merged or buffered, false when skb is not not + * Return: true when the packet is merged or buffered, false when skb is not not * used. */ bool batadv_frag_skb_buffer(struct sk_buff **skb, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3a256af92784..53c27c67cc11 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -138,10 +138,10 @@ static bool batadv_mutual_parents(const struct net_device *dev1, * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it - * is important to prevent this new interface to be used to create a new mesh - * network (this behaviour would lead to a batman-over-batman configuration). - * This function recursively checks all the fathers of the device passed as - * argument looking for a batman-adv soft interface. + * is important to prevent this new interface from being used to create a new + * mesh network (this behaviour would lead to a batman-over-batman + * configuration). This function recursively checks all the fathers of the + * device passed as argument looking for a batman-adv soft interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise @@ -680,8 +680,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) * @slave: the interface enslaved in another master * @master: the master from which slave has to be removed * - * Invoke ndo_del_slave on master passing slave as argument. In this way slave - * is free'd and master can correctly change its internal state. + * Invoke ndo_del_slave on master passing slave as argument. In this way the + * slave is free'd and the master can correctly change its internal state. * * Return: 0 on success, a negative value representing the error otherwise */ @@ -818,7 +818,7 @@ err: * @soft_iface: soft interface to check * * This function is only using RCU for locking - the result can therefore be - * off when another functions is modifying the list at the same time. The + * off when another function is modifying the list at the same time. The * caller can use the rtnl_lock to make sure that the count is accurate. * * Return: number of connected/enslaved hard interfaces diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index f9884dc56cf3..979864c0fa6b 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -69,7 +69,7 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) __printf(2, 3); /** - * _batadv_dbg() - Store debug output with(out) ratelimiting + * _batadv_dbg() - Store debug output with(out) rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @ratelimited: whether output should be rate limited @@ -95,7 +95,7 @@ static inline void _batadv_dbg(int type __always_unused, #endif /** - * batadv_dbg() - Store debug output without ratelimiting + * batadv_dbg() - Store debug output without rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments @@ -104,7 +104,7 @@ static inline void _batadv_dbg(int type __always_unused, _batadv_dbg(type, bat_priv, 0, ## arg) /** - * batadv_dbg_ratelimited() - Store debug output with ratelimiting + * batadv_dbg_ratelimited() - Store debug output with rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d8a255c85e77..519c08c2cfba 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -666,7 +666,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * - * Return: true if AP isolation is on for the VLAN idenfied by vid, false + * Return: true if AP isolation is on for the VLAN identified by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 42b8d1e76dea..0393bb9ed3d0 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -308,7 +308,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a predecessor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: @@ -330,11 +330,11 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, /** * batadv_seq_after() - Checks if a sequence number x is a successor of y - * @x: potential sucessor of @y + * @x: potential successor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a successor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 9ebdc1e864b9..bdc4a1fba1c6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -510,7 +510,7 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * - * If there is a bridge interface on top of dev, collects from that one + * If there is a bridge interface on top of dev, collect from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. @@ -832,8 +832,8 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @flags: TVLV flags indicating the new multicast state * - * Whenever the multicast TVLV flags this nodes announces change this notifies - * userspace via the 'mcast' log level. + * Whenever the multicast TVLV flags this node announces change, this function + * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { @@ -1244,7 +1244,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) * @ethhdr: an ethernet header to determine the protocol family from * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or - * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and + * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, sets and * increases its refcount. */ static struct batadv_orig_node * @@ -1693,7 +1693,7 @@ batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_send() - send packet to any detected multicast recpient + * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier @@ -1742,7 +1742,8 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, - * orig, has toggled then this method updates counter and list accordingly. + * orig, has toggled then this method updates the counter and the list + * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1787,7 +1788,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1832,7 +1833,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1877,7 +1878,7 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1922,7 +1923,7 @@ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 02ed073f95a9..cfb00dfa468a 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -640,7 +640,7 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) * @bat_priv: the bat priv with all the soft interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop - * @test_time: total time ot the tp_meter session + * @test_time: total time of the tp_meter session * @total_bytes: bytes acked to the receiver * @cookie: cookie of tp_meter session * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b0469d15da0e..48d707850f3e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -134,7 +134,7 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_nc_mesh_init() - initialise coding hash table and start house keeping + * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure @@ -700,7 +700,7 @@ batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, } /** - * batadv_nc_worker() - periodic task for house keeping related to network + * batadv_nc_worker() - periodic task for housekeeping related to network * coding * @work: kernel work struct */ @@ -1316,7 +1316,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_src_search() - Loops through the list of neighoring nodes of + * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of * the skb's sender (may be equal to the originator). * @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward @@ -1402,10 +1402,10 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, * @neigh_node: next hop to forward packet to * @ethhdr: pointer to the ethernet header inside the skb * - * Loops through list of neighboring nodes the next hop has a good connection to - * (receives OGMs with a sufficient quality). We need to find a neighbor of our - * next hop that potentially sent a packet which our next hop also received - * (overheard) and has stored for later decoding. + * Loops through the list of neighboring nodes the next hop has a good + * connection to (receives OGMs with a sufficient quality). We need to find a + * neighbor of our next hop that potentially sent a packet which our next hop + * also received (overheard) and has stored for later decoding. * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 5b0c2fffc214..805d8969bdfb 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -325,7 +325,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * - * Return: the neighbor which should be router for this orig_node/iface. + * Return: the neighbor which should be the router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ @@ -515,7 +515,7 @@ out: * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, @@ -620,7 +620,7 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, * * Looks for and possibly returns a neighbour belonging to this hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, @@ -999,7 +999,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the originator * - * Creates a new originator object and initialise all the generic fields. + * Creates a new originator object and initialises all the generic fields. * The new object is not added to the originator list. * * Return: the newly created object or NULL on failure. diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index d343382e9664..27cdf5e4349a 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -449,7 +449,7 @@ free_skb: * @skb: packet to check * @hdr_size: size of header to pull * - * Check for short header and bad addresses in given packet. + * Checks for short header and bad addresses in the given packet. * * Return: negative value when check fails and 0 otherwise. The negative value * depends on the reason: -ENODATA for bad header, -EBADR for broadcast @@ -1113,7 +1113,7 @@ free_skb: * @recv_if: interface that the skb is received on * * This function does one of the three following things: 1) Forward fragment, if - * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till + * the assembled packet will exceed our MTU; 2) Buffer fragment, if we still * lack further fragments; 3) Merge fragments, if we have all needed parts. * * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 7f8ade04e08e..d267b94800d6 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -605,8 +605,8 @@ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet, * given hard_iface. If hard_iface is NULL forwarding packets on all hard * interfaces will be claimed. * - * The packets are being moved from the forw_list to the cleanup_list and - * by that allows already running threads to notice the claiming. + * The packets are being moved from the forw_list to the cleanup_list. This + * makes it possible for already running threads to notice the claim. */ static void batadv_forw_packet_list_steal(struct hlist_head *forw_list, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index f1f1c86f3419..23833a0ba5e6 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -406,7 +406,7 @@ end: * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * - * Sends a ethernet frame to the receive path of the local @soft_iface. + * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index bd2ac570c42c..db7e3774825b 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -66,7 +66,7 @@ /** * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond - * such amound of milliseconds, the receiver is considered unreachable and the + * such amount of milliseconds, the receiver is considered unreachable and the * connection is killed */ #define BATADV_TP_MAX_RTO 30000 @@ -108,10 +108,10 @@ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) * batadv_tp_cwnd() - compute the new cwnd size * @base: base cwnd size value * @increment: the value to add to base to get the new size - * @min: minumim cwnd value (usually MSS) + * @min: minimum cwnd value (usually MSS) * - * Return the new cwnd size and ensures it does not exceed the Advertised - * Receiver Window size. It is wrap around safe. + * Return the new cwnd size and ensure it does not exceed the Advertised + * Receiver Window size. It is wrapped around safely. * For details refer to Section 3.1 of RFC5681 * * Return: new congestion window size in bytes @@ -254,7 +254,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * @dst: the other endpoint MAC address to look for * * Look for a tp_vars object matching dst as end_point and return it after - * having incremented the refcounter. Return NULL is not found + * having increment the refcounter. Return NULL is not found * * Return: matching tp_vars or NULL when no tp_vars with @dst was found */ @@ -291,7 +291,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, * @session: session identifier * * Look for a tp_vars object matching dst as end_point, session as tp meter - * session and return it after having incremented the refcounter. Return NULL + * session and return it after having increment the refcounter. Return NULL * is not found * * Return: matching tp_vars or NULL when no tp_vars was found diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a9635c882fe0..98a0aaaf0d50 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -301,7 +301,7 @@ void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) * @vid: VLAN identifier * * Return: the number of originators advertising the given address/data - * (excluding ourself). + * (excluding our self). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) @@ -842,7 +842,7 @@ out: * table. In case of success the value is updated with the real amount of * reserved bytes * Allocate the needed amount of memory for the entire TT TVLV and write its - * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data + * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * * Return: the size of the allocated buffer or 0 in case of failure. @@ -1674,7 +1674,7 @@ out: * the function argument. * If a TT local entry exists for this non-mesh client remove it. * - * The caller must hold orig_node refcount. + * The caller must hold the orig_node refcount. * * Return: true if the new entry has been added, false otherwise */ @@ -1839,7 +1839,7 @@ out: * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). * Return: best originator list entry or NULL on errors. */ static struct batadv_tt_orig_list_entry * @@ -1887,7 +1887,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, * @tt_global_entry: global translation table entry to be printed * @seq: debugfs table seq_file struct * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). */ static void batadv_tt_global_print_entry(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 0963a43ad996..6a23a566cde1 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -353,8 +353,8 @@ end: * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Return: success if handler was not found or the return value of the handler - * callback. + * Return: success if the handler was not found or the return value of the + * handler callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index d152b8e81f61..cc151e1f23b2 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -455,8 +455,8 @@ struct batadv_orig_node { spinlock_t tt_buff_lock; /** - * @tt_lock: prevents from updating the table while reading it. Table - * update is made up by two operations (data structure update and + * @tt_lock: avoids concurrent read from and write to the table. Table + * update is made up of two operations (data structure update and * metadata -CRC/TTVN-recalculation) and they have to be executed * atomically in order to avoid another thread to read the * table/metadata between those. @@ -748,7 +748,7 @@ struct batadv_neigh_ifinfo { * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression */ struct batadv_bcast_duplist_entry { - /** @orig: mac address of orig node orginating the broadcast */ + /** @orig: mac address of orig node originating the broadcast */ u8 orig[ETH_ALEN]; /** @crc: crc32 checksum of broadcast payload */ @@ -1010,7 +1010,7 @@ struct batadv_priv_tt { /** * @commit_lock: prevents from executing a local TT commit while reading - * the local table. The local TT commit is made up by two operations + * the local table. The local TT commit is made up of two operations * (data structure update and metadata -CRC/TTVN- recalculation) and * they have to be executed atomically in order to avoid another thread * to read the table/metadata between those. @@ -1024,7 +1024,7 @@ struct batadv_priv_tt { #ifdef CONFIG_BATMAN_ADV_BLA /** - * struct batadv_priv_bla - per mesh interface bridge loope avoidance data + * struct batadv_priv_bla - per mesh interface bridge loop avoidance data */ struct batadv_priv_bla { /** @num_requests: number of bla requests in flight */ @@ -1718,7 +1718,7 @@ struct batadv_priv { spinlock_t softif_vlan_list_lock; #ifdef CONFIG_BATMAN_ADV_BLA - /** @bla: bridge loope avoidance data */ + /** @bla: bridge loop avoidance data */ struct batadv_priv_bla bla; #endif -- cgit v1.2.3 From 3bda14d09dc5789a895ab02b7dcfcec19b0a65b3 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Mon, 1 Jun 2020 22:35:22 +0200 Subject: batman-adv: Introduce a configurable per interface hop penalty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some setups multiple hard interfaces with similar link qualities or throughput values are available. But people have expressed the desire to consider one of them as a backup only. Some creative solutions are currently in use: Such people are configuring multiple batman-adv mesh/soft interfaces, wire them together with some veth pairs and then tune the hop penalty to achieve an effect similar to a tunable per interface hop penalty. This patch introduces a new, configurable, per hard interface hop penalty to simplify such setups. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 3 ++- net/batman-adv/bat_iv_ogm.c | 17 +++++++++-------- net/batman-adv/bat_v_ogm.c | 13 ++++++++++--- net/batman-adv/hard-interface.c | 2 ++ net/batman-adv/netlink.c | 12 +++++++++++- net/batman-adv/types.h | 6 ++++++ 6 files changed, 40 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 8cf2ad11ead9..bb0ae945b36a 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -427,7 +427,8 @@ enum batadv_nl_attrs { /** * @BATADV_ATTR_HOP_PENALTY: defines the penalty which will be applied - * to an originator message's tq-field on every hop. + * to an originator message's tq-field on every hop and/or per + * hard interface */ BATADV_ATTR_HOP_PENALTY, diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 5b3a41983156..a4faf5f904d9 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1075,10 +1075,10 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; + unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; - unsigned int tq_iface_penalty; bool ret = false; /* find corresponding one hop neighbor */ @@ -1157,31 +1157,32 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; + tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. */ - tq_iface_penalty = BATADV_TQ_MAX_VALUE; if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) - tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, - bat_priv); + tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, + bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * - tq_iface_penalty; + tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", + "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, - neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_penalty, - batadv_ogm_packet->tq, if_incoming->net_dev->name, + neigh_rq_count, tq_own, tq_asym_penalty, + tq_iface_hop_penalty, batadv_ogm_packet->tq, + if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 0d404f7bcd9f..0f8495b9eeb1 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -455,15 +455,17 @@ unlock: * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the - * characteristic of the interface where the OGM has been received. The return - * value is computed as follows: + * characteristic of the interface where the OGM has been received. + * + * Initially the per hardif hop penalty is applied to the throughput. After + * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) - * - throughput * hop penalty otherwise + * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. */ @@ -472,9 +474,14 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, u32 throughput) { + int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; + /* Apply per hardif hop penalty */ + throughput = throughput * (hop_penalty_max - if_hop_penalty) / + hop_penalty_max; + /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 53c27c67cc11..fa06b51c0144 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -939,6 +939,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; + atomic_set(&hard_iface->hop_penalty, 0); + batadv_v_hardif_init(hard_iface); batadv_check_known_mac_addr(hard_iface->net_dev); diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index cfb00dfa468a..dc193618a761 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -826,6 +826,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, goto nla_put_failure; } + if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, + atomic_read(&hard_iface->hop_penalty))) + goto nla_put_failure; + #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, atomic_read(&hard_iface->bat_v.elp_interval))) @@ -920,9 +924,15 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb, { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; + struct nlattr *attr; + + if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { + attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; + + atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr)); + } #ifdef CONFIG_BATMAN_ADV_BATMAN_V - struct nlattr *attr; if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index cc151e1f23b2..ed519efa3c36 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -208,6 +208,12 @@ struct batadv_hard_iface { /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + /** + * @hop_penalty: penalty which will be applied to the tq-field + * of an OGM received via this interface + */ + atomic_t hop_penalty; + /** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */ struct batadv_hard_iface_bat_iv bat_iv; -- cgit v1.2.3 From 322b598be4d9b9090cda560c4caab78704615ab4 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 16 Jun 2020 12:08:44 +0800 Subject: fpga: dfl: introduce interrupt trigger setting API FPGA user applications may be interested in interrupts generated by DFL features. For example, users can implement their own FPGA logics with interrupts enabled in AFU (Accelerated Function Unit, dynamic region of DFL based FPGA). So user applications need to be notified to handle these interrupts. In order to allow userspace applications to monitor interrupts, driver requires userspace to provide eventfds as interrupt notification channels. Applications then poll/select on the eventfds to get notified. This patch introduces a generic helper functions to do eventfds binding with given interrupts. Sub feature drivers are expected to use XXX_GET_IRQ_NUM to query irq info, and XXX_SET_IRQ to set eventfds for interrupts. This patch also introduces helper functions for these 2 ioctls. Signed-off-by: Luwei Kang Signed-off-by: Wu Hao Signed-off-by: Xu Yilun Signed-off-by: Tom Rix Reviewed-by: Marcelo Tosatti Acked-by: Wu Hao Signed-off-by: Moritz Fischer --- drivers/fpga/dfl.c | 157 ++++++++++++++++++++++++++++++++++++++++++ drivers/fpga/dfl.h | 16 +++++ include/uapi/linux/fpga-dfl.h | 13 ++++ 3 files changed, 186 insertions(+) (limited to 'include/uapi') diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c index d915331fb52c..649958a36e62 100644 --- a/drivers/fpga/dfl.c +++ b/drivers/fpga/dfl.c @@ -10,7 +10,9 @@ * Wu Hao * Xiao Guangrong */ +#include #include +#include #include "dfl.h" @@ -533,6 +535,7 @@ static int build_info_commit_dev(struct build_feature_devs_info *binfo) unsigned int i; /* save resource information for each feature */ + feature->dev = fdev; feature->id = finfo->fid; feature->resource_index = index; feature->ioaddr = finfo->ioaddr; @@ -1393,6 +1396,160 @@ done: } EXPORT_SYMBOL_GPL(dfl_fpga_cdev_config_ports_vf); +static irqreturn_t dfl_irq_handler(int irq, void *arg) +{ + struct eventfd_ctx *trigger = arg; + + eventfd_signal(trigger, 1); + return IRQ_HANDLED; +} + +static int do_set_irq_trigger(struct dfl_feature *feature, unsigned int idx, + int fd) +{ + struct platform_device *pdev = feature->dev; + struct eventfd_ctx *trigger; + int irq, ret; + + irq = feature->irq_ctx[idx].irq; + + if (feature->irq_ctx[idx].trigger) { + free_irq(irq, feature->irq_ctx[idx].trigger); + kfree(feature->irq_ctx[idx].name); + eventfd_ctx_put(feature->irq_ctx[idx].trigger); + feature->irq_ctx[idx].trigger = NULL; + } + + if (fd < 0) + return 0; + + feature->irq_ctx[idx].name = + kasprintf(GFP_KERNEL, "fpga-irq[%u](%s-%llx)", idx, + dev_name(&pdev->dev), feature->id); + if (!feature->irq_ctx[idx].name) + return -ENOMEM; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + ret = PTR_ERR(trigger); + goto free_name; + } + + ret = request_irq(irq, dfl_irq_handler, 0, + feature->irq_ctx[idx].name, trigger); + if (!ret) { + feature->irq_ctx[idx].trigger = trigger; + return ret; + } + + eventfd_ctx_put(trigger); +free_name: + kfree(feature->irq_ctx[idx].name); + + return ret; +} + +/** + * dfl_fpga_set_irq_triggers - set eventfd triggers for dfl feature interrupts + * + * @feature: dfl sub feature. + * @start: start of irq index in this dfl sub feature. + * @count: number of irqs. + * @fds: eventfds to bind with irqs. unbind related irq if fds[n] is negative. + * unbind "count" specified number of irqs if fds ptr is NULL. + * + * Bind given eventfds with irqs in this dfl sub feature. Unbind related irq if + * fds[n] is negative. Unbind "count" specified number of irqs if fds ptr is + * NULL. + * + * Return: 0 on success, negative error code otherwise. + */ +int dfl_fpga_set_irq_triggers(struct dfl_feature *feature, unsigned int start, + unsigned int count, int32_t *fds) +{ + unsigned int i; + int ret = 0; + + /* overflow */ + if (unlikely(start + count < start)) + return -EINVAL; + + /* exceeds nr_irqs */ + if (start + count > feature->nr_irqs) + return -EINVAL; + + for (i = 0; i < count; i++) { + int fd = fds ? fds[i] : -1; + + ret = do_set_irq_trigger(feature, start + i, fd); + if (ret) { + while (i--) + do_set_irq_trigger(feature, start + i, -1); + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(dfl_fpga_set_irq_triggers); + +/** + * dfl_feature_ioctl_get_num_irqs - dfl feature _GET_IRQ_NUM ioctl interface. + * @pdev: the feature device which has the sub feature + * @feature: the dfl sub feature + * @arg: ioctl argument + * + * Return: 0 on success, negative error code otherwise. + */ +long dfl_feature_ioctl_get_num_irqs(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned long arg) +{ + return put_user(feature->nr_irqs, (__u32 __user *)arg); +} +EXPORT_SYMBOL_GPL(dfl_feature_ioctl_get_num_irqs); + +/** + * dfl_feature_ioctl_set_irq - dfl feature _SET_IRQ ioctl interface. + * @pdev: the feature device which has the sub feature + * @feature: the dfl sub feature + * @arg: ioctl argument + * + * Return: 0 on success, negative error code otherwise. + */ +long dfl_feature_ioctl_set_irq(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned long arg) +{ + struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev); + struct dfl_fpga_irq_set hdr; + s32 *fds; + long ret; + + if (!feature->nr_irqs) + return -ENOENT; + + if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr))) + return -EFAULT; + + if (!hdr.count || (hdr.start + hdr.count > feature->nr_irqs) || + (hdr.start + hdr.count < hdr.start)) + return -EINVAL; + + fds = memdup_user((void __user *)(arg + sizeof(hdr)), + hdr.count * sizeof(s32)); + if (IS_ERR(fds)) + return PTR_ERR(fds); + + mutex_lock(&pdata->lock); + ret = dfl_fpga_set_irq_triggers(feature, hdr.start, hdr.count, fds); + mutex_unlock(&pdata->lock); + + kfree(fds); + return ret; +} +EXPORT_SYMBOL_GPL(dfl_feature_ioctl_set_irq); + static void __exit dfl_fpga_exit(void) { dfl_chardev_uinit(); diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h index e1f3ab86560e..a32dfba2a88b 100644 --- a/drivers/fpga/dfl.h +++ b/drivers/fpga/dfl.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -214,14 +215,19 @@ struct dfl_feature_driver { * struct dfl_feature_irq_ctx - dfl private feature interrupt context * * @irq: Linux IRQ number of this interrupt. + * @trigger: eventfd context to signal when interrupt happens. + * @name: irq name needed when requesting irq. */ struct dfl_feature_irq_ctx { int irq; + struct eventfd_ctx *trigger; + char *name; }; /** * struct dfl_feature - sub feature of the feature devices * + * @dev: ptr to pdev of the feature device which has the sub feature. * @id: sub feature id. * @resource_index: each sub feature has one mmio resource for its registers. * this index is used to find its mmio resource from the @@ -233,6 +239,7 @@ struct dfl_feature_irq_ctx { * @priv: priv data of this feature. */ struct dfl_feature { + struct platform_device *dev; u64 id; int resource_index; void __iomem *ioaddr; @@ -503,4 +510,13 @@ int dfl_fpga_cdev_release_port(struct dfl_fpga_cdev *cdev, int port_id); int dfl_fpga_cdev_assign_port(struct dfl_fpga_cdev *cdev, int port_id); void dfl_fpga_cdev_config_ports_pf(struct dfl_fpga_cdev *cdev); int dfl_fpga_cdev_config_ports_vf(struct dfl_fpga_cdev *cdev, int num_vf); +int dfl_fpga_set_irq_triggers(struct dfl_feature *feature, unsigned int start, + unsigned int count, int32_t *fds); +long dfl_feature_ioctl_get_num_irqs(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned long arg); +long dfl_feature_ioctl_set_irq(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned long arg); + #endif /* __FPGA_DFL_H */ diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h index ec70a0746e59..7331350f3067 100644 --- a/include/uapi/linux/fpga-dfl.h +++ b/include/uapi/linux/fpga-dfl.h @@ -151,6 +151,19 @@ struct dfl_fpga_port_dma_unmap { #define DFL_FPGA_PORT_DMA_UNMAP _IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 4) +/** + * struct dfl_fpga_irq_set - the argument for DFL_FPGA_XXX_SET_IRQ ioctl. + * + * @start: Index of the first irq. + * @count: The number of eventfd handler. + * @evtfds: Eventfd handlers. + */ +struct dfl_fpga_irq_set { + __u32 start; + __u32 count; + __s32 evtfds[]; +}; + /* IOCTLs for FME file descriptor */ /** -- cgit v1.2.3 From fe80536acf8397827be77f9b8ada384b90e790d0 Mon Sep 17 00:00:00 2001 From: Martin Date: Sun, 28 Jun 2020 23:18:23 +0530 Subject: bareudp: Added attribute to enable & disable rx metadata collection Metadata need not be collected in receive if the packet from bareudp device is not targeted to openvswitch. Signed-off-by: Martin Signed-off-by: David S. Miller --- Documentation/networking/bareudp.rst | 6 ++++-- drivers/net/bareudp.c | 23 +++++++++++++++++------ include/net/bareudp.h | 1 + include/uapi/linux/if_link.h | 1 + 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/bareudp.rst b/Documentation/networking/bareudp.rst index 465a8b251bfe..0e00636d8d74 100644 --- a/Documentation/networking/bareudp.rst +++ b/Documentation/networking/bareudp.rst @@ -48,5 +48,7 @@ enabled. The bareudp device could be used along with OVS or flower filter in TC. The OVS or TC flower layer must set the tunnel information in SKB dst field before sending packet buffer to the bareudp device for transmission. On reception the -bareudp device extracts and stores the tunnel information in SKB dst field before -passing the packet buffer to the network stack. +bareudp device decapsulates the udp header and passes the inner packet to the +network stack. If RX_COLLECT_METADATA flag is enabled in the device the tunnel +information will be stored in the SKB dst field before the packet buffer is +passed to the network stack. diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 3dd46cd55114..108a8cafc4f8 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -46,6 +46,7 @@ struct bareudp_dev { __be16 port; u16 sport_min; bool multi_proto_mode; + bool rx_collect_metadata; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; @@ -125,13 +126,14 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) bareudp->dev->stats.rx_dropped++; goto drop; } - - tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); - if (!tun_dst) { - bareudp->dev->stats.rx_dropped++; - goto drop; + if (bareudp->rx_collect_metadata) { + tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); + if (!tun_dst) { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + skb_dst_set(skb, &tun_dst->dst); } - skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -575,6 +577,9 @@ static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; + if (data[IFLA_BAREUDP_RX_COLLECT_METADATA]) + conf->rx_collect_metadata = true; + return 0; } @@ -612,6 +617,8 @@ static int bareudp_configure(struct net *net, struct net_device *dev, bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; + bareudp->rx_collect_metadata = conf->rx_collect_metadata; + err = register_netdevice(dev); if (err) return err; @@ -669,6 +676,7 @@ static size_t bareudp_get_size(const struct net_device *dev) nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ + nla_total_size(0) + /* IFLA_BAREUDP_RX_COLLECT_METADATA */ 0; } @@ -685,6 +693,9 @@ static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; + if (bareudp->rx_collect_metadata && + nla_put_flag(skb, IFLA_BAREUDP_RX_COLLECT_METADATA)) + goto nla_put_failure; return 0; diff --git a/include/net/bareudp.h b/include/net/bareudp.h index dc65a0d71d9b..3dd5f9a8d01c 100644 --- a/include/net/bareudp.h +++ b/include/net/bareudp.h @@ -12,6 +12,7 @@ struct bareudp_conf { __be16 port; u16 sport_min; bool multi_proto_mode; + bool rx_collect_metadata; }; struct net_device *bareudp_dev_create(struct net *net, const char *name, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a009365ad67b..cc185a007ade 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -600,6 +600,7 @@ enum { IFLA_BAREUDP_ETHERTYPE, IFLA_BAREUDP_SRCPORT_MIN, IFLA_BAREUDP_MULTIPROTO_MODE, + IFLA_BAREUDP_RX_COLLECT_METADATA, __IFLA_BAREUDP_MAX }; -- cgit v1.2.3 From aee9caa03fc3c8b02f8f31824354d85f30e562e0 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 27 Jun 2020 01:45:28 +0300 Subject: net: sched: sch_red: Add qevents "early_drop" and "mark" In order to allow acting on dropped and/or ECN-marked packets, add two new qevents to the RED qdisc: "early_drop" and "mark". Filters attached at "early_drop" block are executed as packets are early-dropped, those attached at the "mark" block are executed as packets are ECN-marked. Two new attributes are introduced: TCA_RED_EARLY_DROP_BLOCK with the block index for the "early_drop" qevent, and TCA_RED_MARK_BLOCK for the "mark" qevent. Absence of these attributes signifies "don't care": no block is allocated in that case, or the existing blocks are left intact in case of the change callback. For purposes of offloading, blocks attached to these qevents appear with newly-introduced binder types, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP and FLOW_BLOCK_BINDER_TYPE_RED_MARK. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- include/net/flow_offload.h | 2 ++ include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_red.c | 58 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 60 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 3bafb5124ac0..3e793ac66baf 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -424,6 +424,8 @@ enum flow_block_binder_type { FLOW_BLOCK_BINDER_TYPE_UNSPEC, FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, + FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, }; struct flow_block { diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index a95f3ae7ab37..9e7c2c607845 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -257,6 +257,8 @@ enum { TCA_RED_STAB, TCA_RED_MAX_P, TCA_RED_FLAGS, /* bitfield32 */ + TCA_RED_EARLY_DROP_BLOCK, /* u32 */ + TCA_RED_MARK_BLOCK, /* u32 */ __TCA_RED_MAX, }; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 225ce370e5a8..de2be4d04ed6 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -46,6 +46,8 @@ struct red_sched_data { struct red_vars vars; struct red_stats stats; struct Qdisc *qdisc; + struct tcf_qevent qe_early_drop; + struct tcf_qevent qe_mark; }; #define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) @@ -92,6 +94,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ if (INET_ECN_set_ce(skb)) { q->stats.prob_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.prob_drop++; goto congestion_drop; @@ -109,6 +114,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ if (INET_ECN_set_ce(skb)) { q->stats.forced_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.forced_drop++; goto congestion_drop; @@ -129,6 +137,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ return ret; congestion_drop: + skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } @@ -202,6 +214,8 @@ static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); + tcf_qevent_destroy(&q->qe_mark, sch); + tcf_qevent_destroy(&q->qe_early_drop, sch); del_timer_sync(&q->adapt_timer); red_offload(sch, false); qdisc_put(q->qdisc); @@ -213,6 +227,8 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), + [TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 }, + [TCA_RED_MARK_BLOCK] = { .type = NLA_U32 }, }; static int __red_change(struct Qdisc *sch, struct nlattr **tb, @@ -328,12 +344,38 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, q->qdisc = &noop_qdisc; q->sch = sch; timer_setup(&q->adapt_timer, red_adaptative_timer, 0); - return __red_change(sch, tb, extack); + + err = __red_change(sch, tb, extack); + if (err) + return err; + + err = tcf_qevent_init(&q->qe_early_drop, sch, + FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + goto err_early_drop_init; + + err = tcf_qevent_init(&q->qe_mark, sch, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + goto err_mark_init; + + return 0; + +err_mark_init: + tcf_qevent_destroy(&q->qe_early_drop, sch); +err_early_drop_init: + del_timer_sync(&q->adapt_timer); + red_offload(sch, false); + qdisc_put(q->qdisc); + return err; } static int red_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct red_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_RED_MAX + 1]; int err; @@ -345,6 +387,16 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + err = tcf_qevent_validate_change(&q->qe_early_drop, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + return err; + + err = tcf_qevent_validate_change(&q->qe_mark, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + return err; + return __red_change(sch, tb, extack); } @@ -389,7 +441,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, TC_RED_SUPPORTED_FLAGS)) + q->flags, TC_RED_SUPPORTED_FLAGS) || + tcf_qevent_dump(skb, TCA_RED_MARK_BLOCK, &q->qe_mark) || + tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK, &q->qe_early_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); -- cgit v1.2.3 From ecc31c60240b9808a274befc5db6b8a249a6ade1 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 29 Jun 2020 23:46:16 +0300 Subject: ethtool: Add link extended state Currently, drivers can only tell whether the link is up/down using LINKSTATE_GET, but no additional information is given. Add attributes to LINKSTATE_GET command in order to allow drivers to expose the user more information in addition to link state to ease the debug process, for example, reason for link down state. Extended state consists of two attributes - link_ext_state and link_ext_substate. The idea is to avoid 'vendor specific' states in order to prevent drivers to use specific link_ext_state that can be in the future common link_ext_state. The substates allows drivers to add more information to the common link_ext_state. For example, vendor can expose 'Autoneg' as link_ext_state and add 'No partner detected during force mode' as link_ext_substate. If a driver cannot pinpoint the extended state with the substate accuracy, it is free to expose only the extended state and omit the substate attribute. Signed-off-by: Amit Cohen Reviewed-by: Jiri Pirko Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/ethtool.h | 23 ++++++++++++ include/uapi/linux/ethtool.h | 70 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 2 ++ net/ethtool/linkstate.c | 52 ++++++++++++++++++++++++--- 4 files changed, 143 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a23b26eab479..48ad3b6a0150 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -86,6 +86,22 @@ struct net_device; u32 ethtool_op_get_link(struct net_device *dev); int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *eti); + +/** + * struct ethtool_link_ext_state_info - link extended state and substate. + */ +struct ethtool_link_ext_state_info { + enum ethtool_link_ext_state link_ext_state; + union { + enum ethtool_link_ext_substate_autoneg autoneg; + enum ethtool_link_ext_substate_link_training link_training; + enum ethtool_link_ext_substate_link_logical_mismatch link_logical_mismatch; + enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; + enum ethtool_link_ext_substate_cable_issue cable_issue; + u8 __link_ext_substate; + }; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -245,6 +261,11 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * @get_link: Report whether physical link is up. Will only be called if * the netdev is up. Should usually be set to ethtool_op_get_link(), * which uses netif_carrier_ok(). + * @get_link_ext_state: Report link extended state. Should set link_ext_state and + * link_ext_substate (link_ext_substate of 0 means link_ext_substate is unknown, + * do not attach ext_substate attribute to netlink message). If link_ext_state + * and link_ext_substate are unknown, return -ENODATA. If not implemented, + * link_ext_state and link_ext_substate will not be sent to userspace. * @get_eeprom: Read data from the device EEPROM. * Should fill in the magic field. Don't need to check len for zero * or wraparound. Fill in the data argument with the eeprom values @@ -384,6 +405,8 @@ struct ethtool_ops { void (*set_msglevel)(struct net_device *, u32); int (*nway_reset)(struct net_device *); u32 (*get_link)(struct net_device *); + int (*get_link_ext_state)(struct net_device *, + struct ethtool_link_ext_state_info *); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f4662b3a9e1e..d1413538ef30 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -579,6 +579,76 @@ struct ethtool_pauseparam { __u32 tx_pause; }; +/** + * enum ethtool_link_ext_state - link extended state + */ +enum ethtool_link_ext_state { + ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_STATE_NO_CABLE, + ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, + ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, + ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, + ETHTOOL_LINK_EXT_STATE_OVERHEAT, +}; + +/** + * enum ethtool_link_ext_substate_autoneg - more information in addition to + * ETHTOOL_LINK_EXT_STATE_AUTONEG. + */ +enum ethtool_link_ext_substate_autoneg { + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, +}; + +/** + * enum ethtool_link_ext_substate_link_training - more information in addition to + * ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. + */ +enum ethtool_link_ext_substate_link_training { + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, + ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, +}; + +/** + * enum ethtool_link_ext_substate_logical_mismatch - more information in addition + * to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. + */ +enum ethtool_link_ext_substate_link_logical_mismatch { + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, +}; + +/** + * enum ethtool_link_ext_substate_bad_signal_integrity - more information in + * addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY. + */ +enum ethtool_link_ext_substate_bad_signal_integrity { + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, +}; + +/** + * enum ethtool_link_ext_substate_cable_issue - more information in + * addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. + */ +enum ethtool_link_ext_substate_cable_issue { + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, + ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, +}; + #define ETH_GSTRING_LEN 32 /** diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 4dda5e4244a7..c12ce4df4b6b 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -236,6 +236,8 @@ enum { ETHTOOL_A_LINKSTATE_LINK, /* u8 */ ETHTOOL_A_LINKSTATE_SQI, /* u32 */ ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ + ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index afe5ac8a0f00..4834091ec24c 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -9,10 +9,12 @@ struct linkstate_req_info { }; struct linkstate_reply_data { - struct ethnl_reply_data base; - int link; - int sqi; - int sqi_max; + struct ethnl_reply_data base; + int link; + int sqi; + int sqi_max; + bool link_ext_state_provided; + struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -25,6 +27,8 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_STATE] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_SUBSTATE] = { .type = NLA_REJECT }, }; static int linkstate_get_sqi(struct net_device *dev) @@ -61,6 +65,23 @@ static int linkstate_get_sqi_max(struct net_device *dev) mutex_unlock(&phydev->lock); return ret; +}; + +static int linkstate_get_link_ext_state(struct net_device *dev, + struct linkstate_reply_data *data) +{ + int err; + + if (!dev->ethtool_ops->get_link_ext_state) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_link_ext_state(dev, &data->ethtool_link_ext_state_info); + if (err) + return err; + + data->link_ext_state_provided = true; + + return 0; } static int linkstate_prepare_data(const struct ethnl_req_info *req_base, @@ -86,6 +107,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, goto out; data->sqi_max = ret; + if (dev->flags & IFF_UP) { + ret = linkstate_get_link_ext_state(dev, data); + if (ret < 0 && ret != -EOPNOTSUPP && ret != -ENODATA) + goto out; + } + ret = 0; out: ethnl_ops_complete(dev); @@ -107,6 +134,12 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, if (data->sqi_max != -EOPNOTSUPP) len += nla_total_size(sizeof(u32)); + if (data->link_ext_state_provided) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ + + if (data->ethtool_link_ext_state_info.__link_ext_substate) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ + return len; } @@ -128,6 +161,17 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) return -EMSGSIZE; + if (data->link_ext_state_provided) { + if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, + data->ethtool_link_ext_state_info.link_ext_state)) + return -EMSGSIZE; + + if (data->ethtool_link_ext_state_info.__link_ext_substate && + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, + data->ethtool_link_ext_state_info.__link_ext_substate)) + return -EMSGSIZE; + } + return 0; } -- cgit v1.2.3 From 970471914c67b70df24def6b2a30cc42acbebded Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 16 Jun 2020 16:47:14 +0200 Subject: iommu: Allow page responses without PASID Some PCIe devices do not expect a PASID value in PRI Page Responses. If the "PRG Response PASID Required" bit in the PRI capability is zero, then the OS should not set the PASID field. Similarly on Arm SMMU, responses to stall events do not have a PASID. Currently iommu_page_response() systematically checks that the PASID in the page response corresponds to the one in the page request. This can't work with virtualization because a page response coming from a guest OS won't have a PASID if the passed-through device does not require one. Add a flag to page requests that declares whether the corresponding response needs to have a PASID. When this flag isn't set, allow page responses without PASID. Reported-by: Shameerali Kolothum Thodi Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20200616144712.748818-1-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 23 +++++++++++++++++------ include/uapi/linux/iommu.h | 6 +++++- 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d43120eb1dc5..1ed1e14a1f0c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1185,11 +1185,12 @@ EXPORT_SYMBOL_GPL(iommu_report_device_fault); int iommu_page_response(struct device *dev, struct iommu_page_response *msg) { - bool pasid_valid; + bool needs_pasid; int ret = -EINVAL; struct iommu_fault_event *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; + bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); if (!domain || !domain->ops->page_response) @@ -1214,14 +1215,24 @@ int iommu_page_response(struct device *dev, */ list_for_each_entry(evt, ¶m->fault_param->faults, list) { prm = &evt->fault.prm; - pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + if (prm->grpid != msg->grpid) + continue; - if ((pasid_valid && prm->pasid != msg->pasid) || - prm->grpid != msg->grpid) + /* + * If the PASID is required, the corresponding request is + * matched using the group ID, the PASID valid bit and the PASID + * value. Otherwise only the group ID matches request and + * response. + */ + needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) continue; - /* Sanitize the reply */ - msg->flags = pasid_valid ? IOMMU_PAGE_RESP_PASID_VALID : 0; + if (!needs_pasid && has_pasid) { + /* No big deal, just clear it. */ + msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; + msg->pasid = 0; + } ret = domain->ops->page_response(dev, evt, msg); list_del(&evt->list); diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index e907b7091a46..c2b2caf9ed41 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -81,7 +81,10 @@ struct iommu_fault_unrecoverable { /** * struct iommu_fault_page_request - Page Request data * @flags: encodes whether the corresponding fields are valid and whether this - * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values) + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). + * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response + * must have the same PASID value as the page request. When it is clear, + * the page response should not have a PASID. * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) @@ -92,6 +95,7 @@ struct iommu_fault_page_request { #define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) #define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) __u32 flags; __u32 pasid; __u32 grpid; -- cgit v1.2.3 From fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:44 -0700 Subject: bpf: Introduce helper bpf_get_task_stack() Introduce helper bpf_get_task_stack(), which dumps stack trace of given task. This is different to bpf_get_stack(), which gets stack track of current task. One potential use case of bpf_get_task_stack() is to call it from bpf_iter__task and dump all /proc//stack to a seq_file. bpf_get_task_stack() uses stack_trace_save_tsk() instead of get_perf_callchain() for kernel stack. The benefit of this choice is that stack_trace_save_tsk() doesn't require changes in arch/. The downside of using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the stack trace to unsigned long array. For 32-bit systems, we need to translate it to u64 array. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-3-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 37 +++++++++++++++++++- kernel/bpf/stackmap.c | 77 +++++++++++++++++++++++++++++++++++++++--- kernel/bpf/verifier.c | 4 ++- kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 37 +++++++++++++++++++- 7 files changed, 153 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3d2ade703a35..0cd7f6884c5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1627,6 +1627,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; +extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 071f98d0f7c6..5ad72ab2276b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } } +static struct perf_callchain_entry * +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +{ + struct perf_callchain_entry *entry; + int rctx; + + entry = get_callchain_entry(&rctx); + + if (!entry) + return NULL; + + entry->nr = init_nr + + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), + sysctl_perf_event_max_stack - init_nr, 0); + + /* stack_trace_save_tsk() works on unsigned long array, while + * perf_callchain_entry uses u64 array. For 32-bit systems, it is + * necessary to fix this mismatch. + */ + if (__BITS_PER_LONG != 64) { + unsigned long *from = (unsigned long *) entry->ip; + u64 *to = entry->ip; + int i; + + /* copy data from the end to avoid using extra buffer */ + for (i = entry->nr - 1; i >= (int)init_nr; i--) + to[i] = (u64)(from[i]); + } + + put_callchain_entry(rctx); + + return entry; +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { @@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, - u64, flags) +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, if (unlikely(size % elem_size)) goto clear; + /* cannot get valid user stack for task without user_mode regs */ + if (task && user && !user_mode(regs)) + goto err_fault; + num_elem = size / elem_size; if (sysctl_perf_event_max_stack < num_elem) init_nr = 0; else init_nr = sysctl_perf_event_max_stack - num_elem; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + + if (kernel && task) + trace = get_callchain_entry_for_task(task, init_nr); + else + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, + false, false); if (unlikely(!trace)) goto err_fault; @@ -505,6 +548,12 @@ clear: return err; } +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, buf, size, flags); +} + const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, @@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + struct pt_regs *regs = task_pt_regs(task); + + return __bpf_get_stack(regs, task, buf, size, flags); +} + +static int bpf_get_task_stack_btf_ids[5]; +const struct bpf_func_proto bpf_get_task_stack_proto = { + .func = bpf_get_task_stack, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_get_task_stack_btf_ids, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7de98906ddf4..b608185e1ffd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + if ((func_id == BPF_FUNC_get_stack || + func_id == BPF_FUNC_get_task_stack) && + !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5d59dda5f661..977ba3b6f6c6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_query_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return &bpf_get_task_stack_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6bab40ff442e..6843376733df 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -426,6 +426,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', 'struct __sk_buff', 'struct sk_msg_md', @@ -468,6 +469,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 62e9dd177732843ae6c5b9d2ed61e7c9538fa276 Mon Sep 17 00:00:00 2001 From: Shyam Sundar Date: Tue, 30 Jun 2020 03:22:28 -0700 Subject: scsi: qla2xxx: Change in PUREX to handle FPIN ELS requests SAN Congestion Management generates ELS pkts whose size can vary and be > 64 bytes. Change the PUREX handling code to support non-standard ELS pkt size. Link: https://lore.kernel.org/r/20200630102229.29660-2-njavali@marvell.com Reviewed-by: Himanshu Madhani Signed-off-by: Shyam Sundar Signed-off-by: Arun Easi Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_def.h | 15 +++++- drivers/scsi/qla2xxx/qla_gbl.h | 3 +- drivers/scsi/qla2xxx/qla_isr.c | 116 ++++++++++++++++++++++++++++++----------- drivers/scsi/qla2xxx/qla_mbx.c | 22 ++++++-- drivers/scsi/qla2xxx/qla_os.c | 19 +++++-- include/uapi/scsi/fc/fc_els.h | 2 + 6 files changed, 134 insertions(+), 43 deletions(-) (limited to 'include/uapi') diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 42dbf90d4651..9a0f2314fe7b 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -34,6 +34,8 @@ #include #include +#include + /* Big endian Fibre Channel S_ID (source ID) or D_ID (destination ID). */ typedef struct { uint8_t domain; @@ -1304,7 +1306,6 @@ static inline bool qla2xxx_is_valid_mbs(unsigned int mbs) #define RNID_TYPE_ASIC_TEMP 0xC #define ELS_CMD_MAP_SIZE 32 -#define ELS_COMMAND_RDP 0x18 /* * Firmware state codes from get firmware state mailbox command @@ -4522,10 +4523,19 @@ struct active_regions { #define QLA_SET_DATA_RATE_NOLR 1 #define QLA_SET_DATA_RATE_LR 2 /* Set speed and initiate LR */ +#define QLA_DEFAULT_PAYLOAD_SIZE 64 +/* + * This item might be allocated with a size > sizeof(struct purex_item). + * The "size" variable gives the size of the payload (which + * is variable) starting at "iocb". + */ struct purex_item { struct list_head list; struct scsi_qla_host *vha; - void (*process_item)(struct scsi_qla_host *vha, void *pkt); + void (*process_item)(struct scsi_qla_host *vha, + struct purex_item *pkt); + atomic_t in_use; + uint16_t size; struct { uint8_t iocb[64]; } iocb; @@ -4725,6 +4735,7 @@ typedef struct scsi_qla_host { struct list_head head; spinlock_t lock; } purex_list; + struct purex_item default_item; struct name_list_extended gnl; /* Count of active session/fcport */ diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h index 061f91b521b3..9cf33d05e3f8 100644 --- a/drivers/scsi/qla2xxx/qla_gbl.h +++ b/drivers/scsi/qla2xxx/qla_gbl.h @@ -229,7 +229,8 @@ void qla2x00_handle_login_done_event(struct scsi_qla_host *, fc_port_t *, int qla24xx_post_gnl_work(struct scsi_qla_host *, fc_port_t *); int qla24xx_post_relogin_work(struct scsi_qla_host *vha); void qla2x00_wait_for_sess_deletion(scsi_qla_host_t *); -void qla24xx_process_purex_rdp(struct scsi_qla_host *vha, void *pkt); +void qla24xx_process_purex_rdp(struct scsi_qla_host *vha, + struct purex_item *pkt); /* * Global Functions in qla_mid.c source file. diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index cf0800546740..3bbfff20e3a6 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -31,35 +31,11 @@ const char *const port_state_str[] = { "ONLINE" }; -static void qla24xx_purex_iocb(scsi_qla_host_t *vha, void *pkt, - void (*process_item)(struct scsi_qla_host *vha, void *pkt)) -{ - struct purex_list *list = &vha->purex_list; - struct purex_item *item; - ulong flags; - - item = kzalloc(sizeof(*item), GFP_KERNEL); - if (!item) { - ql_log(ql_log_warn, vha, 0x5092, - ">> Failed allocate purex list item.\n"); - return; - } - - item->vha = vha; - item->process_item = process_item; - memcpy(&item->iocb, pkt, sizeof(item->iocb)); - - spin_lock_irqsave(&list->lock, flags); - list_add_tail(&item->list, &list->head); - spin_unlock_irqrestore(&list->lock, flags); - - set_bit(PROCESS_PUREX_IOCB, &vha->dpc_flags); -} - static void -qla24xx_process_abts(struct scsi_qla_host *vha, void *pkt) +qla24xx_process_abts(struct scsi_qla_host *vha, struct purex_item *pkt) { - struct abts_entry_24xx *abts = pkt; + struct abts_entry_24xx *abts = + (struct abts_entry_24xx *)&pkt->iocb; struct qla_hw_data *ha = vha->hw; struct els_entry_24xx *rsp_els; struct abts_entry_24xx *abts_rsp; @@ -789,6 +765,74 @@ qla27xx_handle_8200_aen(scsi_qla_host_t *vha, uint16_t *mb) } } +struct purex_item * +qla24xx_alloc_purex_item(scsi_qla_host_t *vha, uint16_t size) +{ + struct purex_item *item = NULL; + uint8_t item_hdr_size = sizeof(*item); + + if (size > QLA_DEFAULT_PAYLOAD_SIZE) { + item = kzalloc(item_hdr_size + + (size - QLA_DEFAULT_PAYLOAD_SIZE), GFP_ATOMIC); + } else { + if (atomic_inc_return(&vha->default_item.in_use) == 1) { + item = &vha->default_item; + goto initialize_purex_header; + } else { + item = kzalloc(item_hdr_size, GFP_ATOMIC); + } + } + if (!item) { + ql_log(ql_log_warn, vha, 0x5092, + ">> Failed allocate purex list item.\n"); + + return NULL; + } + +initialize_purex_header: + item->vha = vha; + item->size = size; + return item; +} + +static void +qla24xx_queue_purex_item(scsi_qla_host_t *vha, struct purex_item *pkt, + void (*process_item)(struct scsi_qla_host *vha, + struct purex_item *pkt)) +{ + struct purex_list *list = &vha->purex_list; + ulong flags; + + pkt->process_item = process_item; + + spin_lock_irqsave(&list->lock, flags); + list_add_tail(&pkt->list, &list->head); + spin_unlock_irqrestore(&list->lock, flags); + + set_bit(PROCESS_PUREX_IOCB, &vha->dpc_flags); +} + +/** + * qla24xx_copy_std_pkt() - Copy over purex ELS which is + * contained in a single IOCB. + * purex packet. + * @vha: SCSI driver HA context + * @pkt: ELS packet + */ +struct purex_item +*qla24xx_copy_std_pkt(struct scsi_qla_host *vha, void *pkt) +{ + struct purex_item *item; + + item = qla24xx_alloc_purex_item(vha, + QLA_DEFAULT_PAYLOAD_SIZE); + if (!item) + return item; + + memcpy(&item->iocb, pkt, sizeof(item->iocb)); + return item; +} + /** * qla2x00_async_event() - Process aynchronous events. * @vha: SCSI driver HA context @@ -3229,6 +3273,7 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, { struct sts_entry_24xx *pkt; struct qla_hw_data *ha = vha->hw; + struct purex_item *pure_item; if (!ha->flags.fw_started) return; @@ -3280,8 +3325,12 @@ process_err: break; case ABTS_RECV_24XX: if (qla_ini_mode_enabled(vha)) { - qla24xx_purex_iocb(vha, pkt, - qla24xx_process_abts); + pure_item = qla24xx_copy_std_pkt(vha, pkt); + if (!pure_item) + break; + + qla24xx_queue_purex_item(vha, pure_item, + qla24xx_process_abts); break; } if (IS_QLA83XX(ha) || IS_QLA27XX(ha) || @@ -3332,13 +3381,18 @@ process_err: { struct purex_entry_24xx *purex = (void *)pkt; - if (purex->els_frame_payload[3] != ELS_COMMAND_RDP) { + if (purex->els_frame_payload[3] != ELS_RDP) { ql_dbg(ql_dbg_init, vha, 0x5091, "Discarding ELS Request opcode %#x...\n", purex->els_frame_payload[3]); break; } - qla24xx_purex_iocb(vha, pkt, qla24xx_process_purex_rdp); + pure_item = qla24xx_copy_std_pkt(vha, pkt); + if (!pure_item) + break; + + qla24xx_queue_purex_item(vha, pure_item, + qla24xx_process_purex_rdp); break; } default: diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index df31ee0d59b2..b29d3831ea73 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -59,6 +59,7 @@ static struct rom_cmd { { MBC_IOCB_COMMAND_A64 }, { MBC_GET_ADAPTER_LOOP_ID }, { MBC_READ_SFP }, + { MBC_SET_RNID_PARAMS }, { MBC_GET_RNID_PARAMS }, { MBC_GET_SET_ZIO_THRESHOLD }, }; @@ -4866,6 +4867,7 @@ qla24xx_get_port_login_templ(scsi_qla_host_t *vha, dma_addr_t buf_dma, return rval; } +#define PUREX_CMD_COUNT 2 int qla25xx_set_els_cmds_supported(scsi_qla_host_t *vha) { @@ -4874,12 +4876,12 @@ qla25xx_set_els_cmds_supported(scsi_qla_host_t *vha) mbx_cmd_t *mcp = &mc; uint8_t *els_cmd_map; dma_addr_t els_cmd_map_dma; - uint cmd_opcode = ELS_COMMAND_RDP; - uint index = cmd_opcode / 8; - uint bit = cmd_opcode % 8; + uint8_t cmd_opcode[PUREX_CMD_COUNT]; + uint8_t i, index, purex_bit; struct qla_hw_data *ha = vha->hw; - if (!IS_QLA25XX(ha) && !IS_QLA2031(ha) && !IS_QLA27XX(ha)) + if (!IS_QLA25XX(ha) && !IS_QLA2031(ha) && + !IS_QLA27XX(ha) && !IS_QLA28XX(ha)) return QLA_SUCCESS; ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x1197, @@ -4893,7 +4895,17 @@ qla25xx_set_els_cmds_supported(scsi_qla_host_t *vha) return QLA_MEMORY_ALLOC_FAILED; } - els_cmd_map[index] |= 1 << bit; + memset(els_cmd_map, 0, ELS_CMD_MAP_SIZE); + + /* List of Purex ELS */ + cmd_opcode[0] = ELS_FPIN; + cmd_opcode[1] = ELS_RDP; + + for (i = 0; i < PUREX_CMD_COUNT; i++) { + index = cmd_opcode[i] / 8; + purex_bit = cmd_opcode[i] % 8; + els_cmd_map[index] |= 1 << purex_bit; + } mcp->mb[0] = MBC_SET_RNID_PARAMS; mcp->mb[1] = RNID_TYPE_ELS_CMD << 8; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index e92fad99338c..80ce22cfc1b9 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -5893,10 +5893,12 @@ qla25xx_rdp_port_speed_currently(struct qla_hw_data *ha) * vha: SCSI qla host * purex: RDP request received by HBA */ -void qla24xx_process_purex_rdp(struct scsi_qla_host *vha, void *pkt) +void qla24xx_process_purex_rdp(struct scsi_qla_host *vha, + struct purex_item *item) { struct qla_hw_data *ha = vha->hw; - struct purex_entry_24xx *purex = pkt; + struct purex_entry_24xx *purex = + (struct purex_entry_24xx *)&item->iocb; dma_addr_t rsp_els_dma; dma_addr_t rsp_payload_dma; dma_addr_t stat_dma; @@ -6306,6 +6308,15 @@ dealloc: rsp_els, rsp_els_dma); } +void +qla24xx_free_purex_item(struct purex_item *item) +{ + if (item == &item->vha->default_item) + memset(&item->vha->default_item, 0, sizeof(struct purex_item)); + else + kfree(item); +} + void qla24xx_process_purex_list(struct purex_list *list) { struct list_head head = LIST_HEAD_INIT(head); @@ -6318,8 +6329,8 @@ void qla24xx_process_purex_list(struct purex_list *list) list_for_each_entry_safe(item, next, &head, list) { list_del(&item->list); - item->process_item(item->vha, &item->iocb); - kfree(item); + item->process_item(item->vha, item); + qla24xx_free_purex_item(item); } } diff --git a/include/uapi/scsi/fc/fc_els.h b/include/uapi/scsi/fc/fc_els.h index 66318c44acd7..8c704e510e39 100644 --- a/include/uapi/scsi/fc/fc_els.h +++ b/include/uapi/scsi/fc/fc_els.h @@ -41,6 +41,7 @@ enum fc_els_cmd { ELS_REC = 0x13, /* read exchange concise */ ELS_SRR = 0x14, /* sequence retransmission request */ ELS_FPIN = 0x16, /* Fabric Performance Impact Notification */ + ELS_RDP = 0x18, /* Read Diagnostic Parameters */ ELS_RDF = 0x19, /* Register Diagnostic Functions */ ELS_PRLI = 0x20, /* process login */ ELS_PRLO = 0x21, /* process logout */ @@ -110,6 +111,7 @@ enum fc_els_cmd { [ELS_REC] = "REC", \ [ELS_SRR] = "SRR", \ [ELS_FPIN] = "FPIN", \ + [ELS_RDP] = "RDP", \ [ELS_RDF] = "RDF", \ [ELS_PRLI] = "PRLI", \ [ELS_PRLO] = "PRLO", \ -- cgit v1.2.3 From e4266b991fead8eb996688e82ff39f6cc59ef7dd Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 2 Jul 2020 10:13:05 +0200 Subject: bridge: uapi: mrp: Extend MRP attributes to get the status Add MRP attribute IFLA_BRIDGE_MRP_INFO to allow the userspace to get the current state of the MRP instances. This is a nested attribute that contains other attributes like, ring id, index of primary and secondary port, priority, ring state, ring role. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index caa6914a3e53..c114c1c2bd53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -166,6 +166,7 @@ enum { IFLA_BRIDGE_MRP_RING_STATE, IFLA_BRIDGE_MRP_RING_ROLE, IFLA_BRIDGE_MRP_START_TEST, + IFLA_BRIDGE_MRP_INFO, __IFLA_BRIDGE_MRP_MAX, }; @@ -228,6 +229,22 @@ enum { #define IFLA_BRIDGE_MRP_START_TEST_MAX (__IFLA_BRIDGE_MRP_START_TEST_MAX - 1) +enum { + IFLA_BRIDGE_MRP_INFO_UNSPEC, + IFLA_BRIDGE_MRP_INFO_RING_ID, + IFLA_BRIDGE_MRP_INFO_P_IFINDEX, + IFLA_BRIDGE_MRP_INFO_S_IFINDEX, + IFLA_BRIDGE_MRP_INFO_PRIO, + IFLA_BRIDGE_MRP_INFO_RING_STATE, + IFLA_BRIDGE_MRP_INFO_RING_ROLE, + IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, + IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + __IFLA_BRIDGE_MRP_INFO_MAX, +}; + +#define IFLA_BRIDGE_MRP_INFO_MAX (__IFLA_BRIDGE_MRP_INFO_MAX - 1) + struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; -- cgit v1.2.3 From 36a8e8e26542056bbd7eb5e047cadee30587d230 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 2 Jul 2020 10:13:07 +0200 Subject: bridge: Extend br_fill_ifinfo to return MPR status This patch extends the function br_fill_ifinfo to return also the MRP status for each instance on a bridge. It also adds a new filter RTEXT_FILTER_MRP to return the MRP status only when this is set, not to interfer with the vlans. The MRP status is return only on the bridge interfaces. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/bridge/br_netlink.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 879e64950a0a..9b814c92de12 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -778,6 +778,7 @@ enum { #define RTEXT_FILTER_BRVLAN (1 << 1) #define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2) #define RTEXT_FILTER_SKIP_STATS (1 << 3) +#define RTEXT_FILTER_MRP (1 << 4) /* End of information exported to user level */ diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 240e260e3461..c532fa65c983 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -453,6 +453,28 @@ static int br_fill_ifinfo(struct sk_buff *skb, rcu_read_unlock(); if (err) goto nla_put_failure; + + nla_nest_end(skb, af); + } + + if (filter_mask & RTEXT_FILTER_MRP) { + struct nlattr *af; + int err; + + if (!br_mrp_enabled(br) || port) + goto done; + + af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); + if (!af) + goto nla_put_failure; + + rcu_read_lock(); + err = br_mrp_fill_info(skb, br); + rcu_read_unlock(); + + if (err) + goto nla_put_failure; + nla_nest_end(skb, af); } @@ -516,7 +538,8 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_bridge_port *port = br_port_get_rtnl(dev); if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && - !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) + !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && + !(filter_mask & RTEXT_FILTER_MRP)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, -- cgit v1.2.3 From d6528ec883096e7ccdb08257bcc45670bc878519 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Fri, 3 Jul 2020 10:07:23 +0200 Subject: drm/fourcc: Add modifier definitions for describing Amlogic Video Framebuffer Compression Amlogic uses a proprietary lossless image compression protocol and format for their hardware video codec accelerators, either video decoders or video input encoders. It considerably reduces memory bandwidth while writing and reading frames in memory. The underlying storage is considered to be 3 components, 8bit or 10-bit per component, YCbCr 420, single plane : - DRM_FORMAT_YUV420_8BIT - DRM_FORMAT_YUV420_10BIT This modifier will be notably added to DMA-BUF frames imported from the V4L2 Amlogic VDEC decoder. This introduces the basic layout composed of: - a body content organized in 64x32 superblocks with 4096 bytes per superblock in default mode. - a 32 bytes per 128x64 header block This layout is tranferrable between Amlogic SoCs supporting this modifier. The Memory Saving option exist changing the layout superblock size to save memory when using 8bit components pixels size. Finally is also adds the Scatter Memory layout, meaning the header contains IOMMU references to the compressed frames content to optimize memory access and layout. In this mode, only the header memory address is needed, thus the content memory organization is tied to the current producer execution and cannot be saved/dumped neither transferrable between Amlogic SoCs supporting this modifier. Signed-off-by: Neil Armstrong Tested-by: Kevin Hilman Reviewed-by: Kevin Hilman Acked-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200703080728.25207-2-narmstrong@baylibre.com --- include/uapi/drm/drm_fourcc.h | 81 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 993c1b342315..cbf92fdf2712 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -331,6 +331,7 @@ extern "C" { #define DRM_FORMAT_MOD_VENDOR_BROADCOM 0x07 #define DRM_FORMAT_MOD_VENDOR_ARM 0x08 #define DRM_FORMAT_MOD_VENDOR_ALLWINNER 0x09 +#define DRM_FORMAT_MOD_VENDOR_AMLOGIC 0x0a /* add more to the end as needed */ @@ -950,6 +951,86 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) */ #define DRM_FORMAT_MOD_ALLWINNER_TILED fourcc_mod_code(ALLWINNER, 1) +/* + * Amlogic Video Framebuffer Compression modifiers + * + * Amlogic uses a proprietary lossless image compression protocol and format + * for their hardware video codec accelerators, either video decoders or + * video input encoders. + * + * It considerably reduces memory bandwidth while writing and reading + * frames in memory. + * + * The underlying storage is considered to be 3 components, 8bit or 10-bit + * per component YCbCr 420, single plane : + * - DRM_FORMAT_YUV420_8BIT + * - DRM_FORMAT_YUV420_10BIT + * + * The first 8 bits of the mode defines the layout, then the following 8 bits + * defines the options changing the layout. + * + * Not all combinations are valid, and different SoCs may support different + * combinations of layout and options. + */ +#define __fourcc_mod_amlogic_layout_mask 0xf +#define __fourcc_mod_amlogic_options_shift 8 +#define __fourcc_mod_amlogic_options_mask 0xf + +#define DRM_FORMAT_MOD_AMLOGIC_FBC(__layout, __options) \ + fourcc_mod_code(AMLOGIC, \ + ((__layout) & __fourcc_mod_amlogic_layout_mask) | \ + ((__options) & __fourcc_mod_amlogic_options_mask \ + << __fourcc_mod_amlogic_options_shift)) + +/* Amlogic FBC Layouts */ + +/* + * Amlogic FBC Basic Layout + * + * The basic layout is composed of: + * - a body content organized in 64x32 superblocks with 4096 bytes per + * superblock in default mode. + * - a 32 bytes per 128x64 header block + * + * This layout is transferrable between Amlogic SoCs supporting this modifier. + */ +#define AMLOGIC_FBC_LAYOUT_BASIC (1ULL) + +/* + * Amlogic FBC Scatter Memory layout + * + * Indicates the header contains IOMMU references to the compressed + * frames content to optimize memory access and layout. + * + * In this mode, only the header memory address is needed, thus the + * content memory organization is tied to the current producer + * execution and cannot be saved/dumped neither transferrable between + * Amlogic SoCs supporting this modifier. + * + * Due to the nature of the layout, these buffers are not expected to + * be accessible by the user-space clients, but only accessible by the + * hardware producers and consumers. + * + * The user-space clients should expect a failure while trying to mmap + * the DMA-BUF handle returned by the producer. + */ +#define AMLOGIC_FBC_LAYOUT_SCATTER (2ULL) + +/* Amlogic FBC Layout Options Bit Mask */ + +/* + * Amlogic FBC Memory Saving mode + * + * Indicates the storage is packed when pixel size is multiple of word + * boudaries, i.e. 8bit should be stored in this mode to save allocation + * memory. + * + * This mode reduces body layout to 3072 bytes per 64x32 superblock with + * the basic layout and 3200 bytes per 64x32 superblock combined with + * the scatter layout. + */ +#define AMLOGIC_FBC_OPTION_MEM_SAVING (1ULL << 0) + #if defined(__cplusplus) } #endif -- cgit v1.2.3 From 74cccc3d38438b346e40a4f8133cff3f0839ff84 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:11 +0200 Subject: netfilter: nf_tables: add NFTA_CHAIN_ID attribute This netlink attribute allows you to refer to chains inside a transaction as an alternative to the name and the handle. The chain binding support requires this new chain ID approach. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 15 ++++++++++++--- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6f0f6fca9ac3..3e5226684017 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1433,6 +1433,7 @@ struct nft_trans_chain { char *name; struct nft_stats __percpu *stats; u8 policy; + u32 chain_id; }; #define nft_trans_chain_update(trans) \ @@ -1443,6 +1444,8 @@ struct nft_trans_chain { (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) +#define nft_trans_chain_id(trans) \ + (((struct nft_trans_chain *)trans->data)->chain_id) struct nft_trans_table { bool update; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 4565456c0ef4..477779595b78 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -196,6 +196,7 @@ enum nft_table_attributes { * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING) * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes) * @NFTA_CHAIN_FLAGS: chain flags + * @NFTA_CHAIN_ID: uniquely identifies a chain in a transaction (NLA_U32) */ enum nft_chain_attributes { NFTA_CHAIN_UNSPEC, @@ -209,6 +210,7 @@ enum nft_chain_attributes { NFTA_CHAIN_COUNTERS, NFTA_CHAIN_PAD, NFTA_CHAIN_FLAGS, + NFTA_CHAIN_ID, __NFTA_CHAIN_MAX }; #define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7647ecfa0d40..650ef0dd0773 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -280,9 +280,15 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) if (trans == NULL) return ERR_PTR(-ENOMEM); - if (msg_type == NFT_MSG_NEWCHAIN) + if (msg_type == NFT_MSG_NEWCHAIN) { nft_activate_next(ctx->net, ctx->chain); + if (ctx->nla[NFTA_CHAIN_ID]) { + nft_trans_chain_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); + } + } + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return trans; } @@ -1274,6 +1280,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, + [NFTA_CHAIN_ID] = { .type = NLA_U32 }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -2154,9 +2161,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_chain *chain = NULL; const struct nlattr *attr; struct nft_table *table; - struct nft_chain *chain; u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; @@ -2181,7 +2188,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } attr = nla[NFTA_CHAIN_HANDLE]; - } else { + } else if (nla[NFTA_CHAIN_NAME]) { chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { @@ -2190,6 +2197,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } chain = NULL; } + } else if (!nla[NFTA_CHAIN_ID]) { + return -EINVAL; } if (nla[NFTA_CHAIN_POLICY]) { -- cgit v1.2.3 From 837830a4b439bfeb86c70b0115c280377c84714b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:16 +0200 Subject: netfilter: nf_tables: add NFTA_RULE_CHAIN_ID attribute This new netlink attribute allows you to add rules to chains by the chain ID. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 36 ++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 477779595b78..2304d1b7ba5e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -240,6 +240,7 @@ enum nft_rule_attributes { NFTA_RULE_PAD, NFTA_RULE_ID, NFTA_RULE_POSITION_ID, + NFTA_RULE_CHAIN_ID, __NFTA_RULE_MAX }; #define NFTA_RULE_MAX (__NFTA_RULE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 650ef0dd0773..fbe8f9209813 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2153,6 +2153,22 @@ err: return err; } +static struct nft_chain *nft_chain_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_chain *chain = trans->ctx.chain; + + if (trans->msg_type == NFT_MSG_NEWCHAIN && + id == nft_trans_chain_id(trans)) + return chain; + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2633,6 +2649,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, [NFTA_RULE_ID] = { .type = NLA_U32 }, [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 }, + [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, @@ -3039,10 +3056,21 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + if (nla[NFTA_RULE_CHAIN]) { + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); + return PTR_ERR(chain); + } + } else if (nla[NFTA_RULE_CHAIN_ID]) { + chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); + return PTR_ERR(chain); + } + } else { + return -EINVAL; } if (nla[NFTA_RULE_HANDLE]) { -- cgit v1.2.3 From 51d70f181ff4e2c996ddf256af1efecd7d5864e5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:21 +0200 Subject: netfilter: nf_tables: add NFTA_VERDICT_CHAIN_ID attribute This netlink attribute allows you to identify the chain to jump/goto by means of the chain ID. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2304d1b7ba5e..683e75126d68 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -471,11 +471,13 @@ enum nft_data_attributes { * * @NFTA_VERDICT_CODE: nf_tables verdict (NLA_U32: enum nft_verdicts) * @NFTA_VERDICT_CHAIN: jump target chain name (NLA_STRING) + * @NFTA_VERDICT_CHAIN_ID: jump target chain ID (NLA_U32) */ enum nft_verdict_attributes { NFTA_VERDICT_UNSPEC, NFTA_VERDICT_CODE, NFTA_VERDICT_CHAIN, + NFTA_VERDICT_CHAIN_ID, __NFTA_VERDICT_MAX }; #define NFTA_VERDICT_MAX (__NFTA_VERDICT_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fbe8f9209813..d86602797a69 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8242,6 +8242,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 }, }; static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, @@ -8278,10 +8279,19 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, break; case NFT_JUMP: case NFT_GOTO: - if (!tb[NFTA_VERDICT_CHAIN]) + if (tb[NFTA_VERDICT_CHAIN]) { + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], + genmask); + } else if (tb[NFTA_VERDICT_CHAIN_ID]) { + chain = nft_chain_lookup_byid(ctx->net, + tb[NFTA_VERDICT_CHAIN_ID]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } else { return -EINVAL; - chain = nft_chain_lookup(ctx->net, ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + } + if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) -- cgit v1.2.3 From 67c49de4ad862c567088c5119cf125e566f56e7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:25 +0200 Subject: netfilter: nf_tables: expose enum nft_chain_flags through UAPI This enum definition was never exposed through UAPI. Rename NFT_BASE_CHAIN to NFT_CHAIN_BASE for consistency. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +------ include/uapi/linux/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_api.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3e5226684017..6d1e7da6e00a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -921,11 +921,6 @@ static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, (expr) != (last); \ (expr) = nft_expr_next(expr)) -enum nft_chain_flags { - NFT_BASE_CHAIN = 0x1, - NFT_CHAIN_HW_OFFLOAD = 0x2, -}; - #define NFT_CHAIN_POLICY_UNSET U8_MAX /** @@ -1036,7 +1031,7 @@ static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chai static inline bool nft_is_base_chain(const struct nft_chain *chain) { - return chain->flags & NFT_BASE_CHAIN; + return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 683e75126d68..2cf7cc3b50c1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -184,6 +184,11 @@ enum nft_table_attributes { }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) +enum nft_chain_flags { + NFT_CHAIN_BASE = (1 << 0), + NFT_CHAIN_HW_OFFLOAD = (1 << 1), +}; + /** * enum nft_chain_attributes - nf_tables chain netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d86602797a69..b7582a1c8dce 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1903,7 +1903,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, nft_basechain_hook_init(&basechain->ops, family, hook, chain); } - chain->flags |= NFT_BASE_CHAIN | flags; + chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && nft_chain_offload_priority(basechain) < 0) @@ -2255,7 +2255,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - flags |= chain->flags & NFT_BASE_CHAIN; + flags |= chain->flags & NFT_CHAIN_BASE; return nf_tables_updchain(&ctx, genmask, policy, flags); } -- cgit v1.2.3 From d0e2c7de92c7f2b3d355ad76b0bb9fc43d1beb87 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:36 +0200 Subject: netfilter: nf_tables: add NFT_CHAIN_BINDING This new chain flag specifies that: * the kernel dynamically allocates the chain name, if no chain name is specified. * If the immediate expression that refers to this chain is removed, then this bound chain (and its content) is destroyed. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 ++++- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 86 +++++++++++++++++++++++++++----- net/netfilter/nft_immediate.c | 51 +++++++++++++++++++ 4 files changed, 138 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6d1e7da6e00a..822c26766330 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -899,6 +899,8 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } +void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule); + static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -944,7 +946,8 @@ struct nft_chain { struct nft_table *table; u64 handle; u32 use; - u8 flags:6, + u8 flags:5, + bound:1, genmask:2; char *name; @@ -989,6 +992,14 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); +static inline bool nft_chain_is_bound(struct nft_chain *chain) +{ + return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; +} + +void nft_chain_del(struct nft_chain *chain); +void nf_tables_chain_destroy(struct nft_ctx *ctx); + struct nft_stats { u64 bytes; u64 pkts; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2cf7cc3b50c1..e00b4ae6174e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -187,6 +187,7 @@ enum nft_table_attributes { enum nft_chain_flags { NFT_CHAIN_BASE = (1 << 0), NFT_CHAIN_HW_OFFLOAD = (1 << 1), + NFT_CHAIN_BINDING = (1 << 2), }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7cb9c07802b..b8a970dad213 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1056,6 +1056,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -1098,6 +1101,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -1413,13 +1419,12 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; - - if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) && - nla_put_be32(skb, NFTA_CHAIN_FLAGS, - htonl(NFT_CHAIN_HW_OFFLOAD))) - goto nla_put_failure; } + if (chain->flags && + nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; @@ -1621,7 +1626,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->rules_next); } -static void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; struct nft_hook *hook, *next; @@ -1928,6 +1933,8 @@ static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) return 0; } +static u64 chain_id; + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1936,6 +1943,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_base_chain *basechain; struct nft_stats __percpu *stats; struct net *net = ctx->net; + char name[NFT_NAME_MAXLEN]; struct nft_trans *trans; struct nft_chain *chain; struct nft_rule **rules; @@ -1947,6 +1955,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook hook; + if (flags & NFT_CHAIN_BINDING) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) return err; @@ -1976,16 +1987,33 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return err; } } else { + if (flags & NFT_CHAIN_BASE) + return -EINVAL; + if (flags & NFT_CHAIN_HW_OFFLOAD) + return -EOPNOTSUPP; + chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) return -ENOMEM; + + chain->flags = flags; } ctx->chain = chain; INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + + if (nla[NFTA_CHAIN_NAME]) { + chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + } else { + if (!(flags & NFT_CHAIN_BINDING)) + return -EINVAL; + + snprintf(name, sizeof(name), "__chain%llu", ++chain_id); + chain->name = kstrdup(name, GFP_KERNEL); + } + if (!chain->name) { err = -ENOMEM; goto err1; @@ -2976,8 +3004,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -static void nf_tables_rule_release(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -3075,6 +3102,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; + } else if (nla[NFTA_RULE_CHAIN_ID]) { chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); if (IS_ERR(chain)) { @@ -3294,6 +3324,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); @@ -5330,11 +5362,24 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, */ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { + struct nft_chain *chain; + struct nft_rule *rule; + if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use++; + chain = data->verdict.chain; + chain->use++; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use++; + list_for_each_entry(rule, &chain->rules, list) + chain->use++; + + nft_chain_add(chain->table, chain); break; } } @@ -7474,7 +7519,7 @@ static void nft_obj_del(struct nft_object *obj) list_del_rcu(&obj->list); } -static void nft_chain_del(struct nft_chain *chain) +void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; @@ -7825,6 +7870,10 @@ static int __nf_tables_abort(struct net *net, bool autoload) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { + if (nft_chain_is_bound(trans->ctx.chain)) { + nft_trans_destroy(trans); + break; + } trans->ctx.table->use--; nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, @@ -8321,10 +8370,23 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { + struct nft_chain *chain; + struct nft_rule *rule; + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use--; + chain = data->verdict.chain; + chain->use--; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use--; + list_for_each_entry(rule, &chain->rules, list) + chain->use--; + + nft_chain_del(chain); break; } } diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c7f0ef73d939..9e556638bb32 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -54,6 +54,23 @@ static int nft_immediate_init(const struct nft_ctx *ctx, if (err < 0) goto err1; + if (priv->dreg == NFT_REG_VERDICT) { + struct nft_chain *chain = priv->data.verdict.chain; + + switch (priv->data.verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + if (nft_chain_is_bound(chain)) { + err = -EBUSY; + goto err1; + } + chain->bound = true; + break; + default: + break; + } + } + return 0; err1: @@ -81,6 +98,39 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg != NFT_REG_VERDICT) + return; + + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + + if (!nft_chain_is_bound(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nf_tables_rule_release(&chain_ctx, rule); + + nf_tables_chain_destroy(&chain_ctx); + break; + default: + break; + } +} + static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); @@ -170,6 +220,7 @@ static const struct nft_expr_ops nft_imm_ops = { .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, + .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, -- cgit v1.2.3 From c1f79a2eefdcc0aef5d7a911c27a3f75f1936ecd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Jul 2020 02:51:28 +0200 Subject: netfilter: nf_tables: reject unsupported chain flags Bail out if userspace sends unsupported chain flags. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e00b4ae6174e..42f351c1f5c5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -189,6 +189,9 @@ enum nft_chain_flags { NFT_CHAIN_HW_OFFLOAD = (1 << 1), NFT_CHAIN_BINDING = (1 << 2), }; +#define NFT_CHAIN_FLAGS (NFT_CHAIN_BASE | \ + NFT_CHAIN_HW_OFFLOAD | \ + NFT_CHAIN_BINDING) /** * enum nft_chain_attributes - nf_tables chain netlink attributes diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b8a970dad213..f96785586f64 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2285,6 +2285,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, else if (chain) flags = chain->flags; + if (flags & ~NFT_CHAIN_FLAGS) + return -EOPNOTSUPP; + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { -- cgit v1.2.3 From ea1be1e59b19017e61aa357d524b743ba5602d3c Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 7 May 2020 20:03:35 +0800 Subject: serial: Remove duplicated macro definition of port type There exists the same macro definition of port type from 0 to 13 in include/uapi/linux/serial.h, remove these duplicated code in include/uapi/linux/serial_core.h which includes the former header. Acked-by: Jiri Slaby Signed-off-by: Tiezhu Yang Link: https://lore.kernel.org/r/1588853015-28392-1-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 8ec3dd742ea4..851b982f8c4b 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -26,20 +26,6 @@ /* * The type definitions. These are from Ted Ts'o's serial.h */ -#define PORT_UNKNOWN 0 -#define PORT_8250 1 -#define PORT_16450 2 -#define PORT_16550 3 -#define PORT_16550A 4 -#define PORT_CIRRUS 5 -#define PORT_16650 6 -#define PORT_16650V2 7 -#define PORT_16750 8 -#define PORT_STARTECH 9 -#define PORT_16C950 10 -#define PORT_16654 11 -#define PORT_16850 12 -#define PORT_RSA 13 #define PORT_NS16550A 14 #define PORT_XSCALE 15 #define PORT_RM9000 16 /* PMC-Sierra RM9xxx internal UART */ -- cgit v1.2.3 From 9ac2b63791ef63935c71e2a7f5444a1118c4d084 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Fri, 26 Jun 2020 17:48:00 +0100 Subject: drm: drm_fourcc: Add generic alias for 16_16_TILE modifier In cases such as DRM_FORMAT_MOD_SAMSUNG_16_16_TILE, the modifier describes a generic pixel re-ordering which can be applicable to multiple vendors. Define an alias: DRM_FORMAT_MOD_GENERIC_16_16_TILE, which can be used to describe this layout in a vendor-neutral way, and add a comment about the expected usage of such "generic" modifiers. Changes in v2: - Move note about future cases to comment (Daniel) Signed-off-by: Brian Starkey Reviewed-by: Daniel Vetter Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20200626164800.11595-1-brian.starkey@arm.com --- include/uapi/drm/drm_fourcc.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index cbf92fdf2712..4bee7de5f306 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -346,8 +346,33 @@ extern "C" { * When adding a new token please document the layout with a code comment, * similar to the fourcc codes above. drm_fourcc.h is considered the * authoritative source for all of these. + * + * Generic modifier names: + * + * DRM_FORMAT_MOD_GENERIC_* definitions are used to provide vendor-neutral names + * for layouts which are common across multiple vendors. To preserve + * compatibility, in cases where a vendor-specific definition already exists and + * a generic name for it is desired, the common name is a purely symbolic alias + * and must use the same numerical value as the original definition. + * + * Note that generic names should only be used for modifiers which describe + * generic layouts (such as pixel re-ordering), which may have + * independently-developed support across multiple vendors. + * + * In future cases where a generic layout is identified before merging with a + * vendor-specific modifier, a new 'GENERIC' vendor or modifier using vendor + * 'NONE' could be considered. This should only be for obvious, exceptional + * cases to avoid polluting the 'GENERIC' namespace with modifiers which only + * apply to a single vendor. + * + * Generic names should not be used for cases where multiple hardware vendors + * have implementations of the same standardised compression scheme (such as + * AFBC). In those cases, all implementations should use the same format + * modifier(s), reflecting the vendor of the standard. */ +#define DRM_FORMAT_MOD_GENERIC_16_16_TILE DRM_FORMAT_MOD_SAMSUNG_16_16_TILE + /* * Invalid Modifier * -- cgit v1.2.3 From 1c8fb1ea5a1dbd2159e78fa580aaffb001794cfa Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 30 Jun 2020 12:39:12 +0300 Subject: IB/uverbs: Expose UAPI to query ucontext Expose UAPI to query ucontext, this will let user space application that didn't allocate the ucontext but has access to by owning the matching command FD to retrieve the ucontext information. Link: https://lore.kernel.org/r/20200630093916.332097-4-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/uverbs_std_types_device.c | 41 ++++++++++++++++++++++- include/rdma/ib_verbs.h | 4 +++ include/uapi/rdma/ib_user_ioctl_cmds.h | 6 ++++ 4 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 40cf07129f66..1900c0df3c8a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2674,6 +2674,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); + SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index ae4a59d6f9b1..8e58605a17be 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -229,6 +229,37 @@ static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( return 0; } +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_CONTEXT)( + struct uverbs_attr_bundle *attrs) +{ + u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + u32 num_comp; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!ib_dev->ops.query_ucontext) + return -EOPNOTSUPP; + + num_comp = attrs->ufile->device->num_comp_vectors; + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + &num_comp, sizeof(num_comp)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + &core_support, sizeof(core_support)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + return ucontext->device->ops.query_ucontext(ucontext, attrs); +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_GET_CONTEXT, UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, @@ -237,6 +268,13 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u64), UA_OPTIONAL), UVERBS_ATTR_UHW()); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + UVERBS_ATTR_TYPE(u64), UA_OPTIONAL)); + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_INFO_HANDLES, /* Also includes any device specific object ids */ @@ -260,7 +298,8 @@ DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, &UVERBS_METHOD(UVERBS_METHOD_GET_CONTEXT), &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), - &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT)); + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT)); const struct uapi_definition uverbs_def_obj_device[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 20c801730fed..6c72bb194148 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2645,6 +2645,10 @@ struct ib_device_ops { */ int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); + /* query driver for its ucontext properties */ + int (*query_ucontext)(struct ib_ucontext *context, + struct uverbs_attr_bundle *attrs); + DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_cq); DECLARE_RDMA_OBJ_SIZE(ib_pd); diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 4961d5e858eb..83b6e71ea216 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -69,6 +69,7 @@ enum uverbs_methods_device { UVERBS_METHOD_INFO_HANDLES, UVERBS_METHOD_QUERY_PORT, UVERBS_METHOD_GET_CONTEXT, + UVERBS_METHOD_QUERY_CONTEXT, }; enum uverbs_attrs_invoke_write_cmd_attr_ids { @@ -87,6 +88,11 @@ enum uverbs_attrs_get_context_attr_ids { UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, }; +enum uverbs_attrs_query_context_attr_ids { + UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, +}; + enum uverbs_attrs_create_cq_cmd_attr_ids { UVERBS_ATTR_CREATE_CQ_HANDLE, UVERBS_ATTR_CREATE_CQ_CQE, -- cgit v1.2.3 From 0fb556b2b58d6b8336c94379042bad2a8512af1a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 30 Jun 2020 12:39:14 +0300 Subject: RDMA/mlx5: Implement the query ucontext functionality Implement the query ucontext functionality by returning the original ucontext data as part of an extra mlx5 attribute that holds the driver UAPI response. Link: https://lore.kernel.org/r/20200630093916.332097-6-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 1 + drivers/infiniband/hw/mlx5/main.c | 35 ++++++++++++++++++++++++++++++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 4 ++++ 3 files changed, 40 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 2d882c02387c..ef04a261097f 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -790,6 +790,7 @@ int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, } return uverbs_copy_to(bundle, idx, from, size); } +EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero); /* Once called an abort will call through to the type's destroy_hw() */ void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 56039fa6c530..dc77388464c9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1990,6 +1990,29 @@ out_ctx: return err; } +static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_alloc_ucontext_resp uctx_resp = {}; + int ret; + + ret = set_ucontext_resp(ibcontext, &uctx_resp); + if (ret) + return ret; + + uctx_resp.response_length = + min_t(size_t, + uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX), + sizeof(uctx_resp)); + + ret = uverbs_copy_to_struct_or_zero(attrs, + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, + &uctx_resp, + sizeof(uctx_resp)); + return ret; +} + static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); @@ -6364,6 +6387,16 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE( UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, enum mlx5_ib_uapi_flow_action_flags)); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_query_context, + UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, + UVERBS_ATTR_STRUCT(struct mlx5_ib_alloc_ucontext_resp, + dump_fill_mkey), + UA_MANDATORY)); + static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), @@ -6372,6 +6405,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, &mlx5_ib_flow_action), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), @@ -6605,6 +6639,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .query_pkey = mlx5_ib_query_pkey, .query_qp = mlx5_ib_query_qp, .query_srq = mlx5_ib_query_srq, + .query_ucontext = mlx5_ib_query_ucontext, .read_counters = mlx5_ib_read_counters, .reg_user_mr = mlx5_ib_reg_user_mr, .req_notify_cq = mlx5_ib_arm_cq, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 8e316ef896b5..496309e8a856 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -228,6 +228,10 @@ enum mlx5_ib_flow_matcher_methods { MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, }; +enum mlx5_ib_device_query_context_attrs { + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT), +}; + #define MLX5_IB_DW_MATCH_PARAM 0x80 struct mlx5_ib_match_params { -- cgit v1.2.3 From 05f71ef9797454b9384c740b8acd0d02eca1d1bc Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 30 Jun 2020 12:39:15 +0300 Subject: RDMA/mlx5: Introduce UAPI to query PD attributes Introduce UAPI to query PD attributes, this can be used to retrieve PD attributes by having the PD handle of the created one and owning the command FD for the ucontxet. Link: https://lore.kernel.org/r/20200630093916.332097-7-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/Makefile | 3 ++- drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 ++ drivers/infiniband/hw/mlx5/std_types.c | 45 ++++++++++++++++++++++++++++++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 10 +++++++ 5 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/mlx5/std_types.c (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 8cca61c671f8..9838719aacb9 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -23,4 +23,5 @@ mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \ flow.o \ - qos.o + qos.o \ + std_types.o diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index dc77388464c9..324a98c77ad5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6401,6 +6401,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), UAPI_DEF_CHAIN(mlx5_ib_qos_defs), + UAPI_DEF_CHAIN(mlx5_ib_std_types_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, &mlx5_ib_flow_action), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 74d3507d6f80..27b069ef63d9 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1384,6 +1384,8 @@ int mlx5_ib_fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr); extern const struct uapi_definition mlx5_ib_devx_defs[]; extern const struct uapi_definition mlx5_ib_flow_defs[]; extern const struct uapi_definition mlx5_ib_qos_defs[]; +extern const struct uapi_definition mlx5_ib_std_types_defs[]; + #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c new file mode 100644 index 000000000000..16145fda68d0 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include +#include +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +static int UVERBS_HANDLER(MLX5_IB_METHOD_PD_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_QUERY_PD_HANDLE); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_PD_RESP_PDN, + &mpd->pdn, sizeof(mpd->pdn)); +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_PD_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_QUERY_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_PD_RESP_PDN, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +ADD_UVERBS_METHODS(mlx5_ib_pd, + UVERBS_OBJECT_PD, + &UVERBS_METHOD(MLX5_IB_METHOD_PD_QUERY)); + +const struct uapi_definition mlx5_ib_std_types_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE( + UVERBS_OBJECT_PD, + &mlx5_ib_pd), + {}, +}; diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 496309e8a856..b330e6eee626 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -290,4 +290,14 @@ enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, }; +enum mlx5_ib_query_pd_attrs { + MLX5_IB_ATTR_QUERY_PD_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_QUERY_PD_RESP_PDN, +}; + +enum mlx5_ib_pd_methods { + MLX5_IB_METHOD_PD_QUERY = (1U << UVERBS_ID_NS_SHIFT), + +}; + #endif -- cgit v1.2.3 From 6c01e6b218aea09ec9947cbf88a4db97b4dd155c Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 30 Jun 2020 12:39:16 +0300 Subject: IB/uverbs: Expose UAPI to query MR Expose UAPI to query MR, this will let user space application that didn't allocate the MR but has access to by owning the matching command FD to retrieve its information. Link: https://lore.kernel.org/r/20200630093916.332097-8-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_std_types_mr.c | 52 ++++++++++++++++++++++++++- include/uapi/rdma/ib_user_ioctl_cmds.h | 9 +++++ 2 files changed, 60 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index a2722ef8496e..62f58ad56afd 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -148,6 +148,36 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( return ret; } +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_mr *mr = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_QUERY_MR_HANDLE); + int ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LKEY, &mr->lkey, + sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + &mr->length, sizeof(mr->length)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_IOVA, + &mr->iova, sizeof(mr->iova)); + + return IS_UVERBS_COPY_ERR(ret) ? ret : 0; +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_ADVISE_MR, UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE, @@ -165,6 +195,25 @@ DECLARE_UVERBS_NAMED_METHOD( UA_MANDATORY, UA_ALLOC_AND_COPY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_QUERY_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL)); + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_DM_MR_REG, UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, @@ -206,7 +255,8 @@ DECLARE_UVERBS_NAMED_OBJECT( UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr), &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG), &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY), - &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR)); + &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR)); const struct uapi_definition uverbs_def_obj_mr[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR, diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 83b6e71ea216..99dcabf61a71 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -248,6 +248,7 @@ enum uverbs_methods_mr { UVERBS_METHOD_DM_MR_REG, UVERBS_METHOD_MR_DESTROY, UVERBS_METHOD_ADVISE_MR, + UVERBS_METHOD_QUERY_MR, }; enum uverbs_attrs_mr_destroy_ids { @@ -261,6 +262,14 @@ enum uverbs_attrs_advise_mr_cmd_attr_ids { UVERBS_ATTR_ADVISE_MR_SGE_LIST, }; +enum uverbs_attrs_query_mr_cmd_attr_ids { + UVERBS_ATTR_QUERY_MR_HANDLE, + UVERBS_ATTR_QUERY_MR_RESP_LKEY, + UVERBS_ATTR_QUERY_MR_RESP_RKEY, + UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + UVERBS_ATTR_QUERY_MR_RESP_IOVA, +}; + enum uverbs_attrs_create_counters_cmd_attr_ids { UVERBS_ATTR_CREATE_COUNTERS_HANDLE, }; -- cgit v1.2.3 From fe6a3d6521227e49857d0d6caabbdae3bd4aef89 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 16 Jun 2020 12:08:45 +0800 Subject: fpga: dfl: afu: add interrupt support for port error reporting Error reporting interrupt is very useful to notify users that some errors are detected by the hardware. Once users are notified, they could query hardware logged error states, no need to continuously poll on these states. This patch adds interrupt support for port error reporting sub feature. It follows the common DFL interrupt notification and handling mechanism, implements two ioctl commands below for user to query number of irqs supported, and set/unset interrupt triggers. Ioctls: * DFL_FPGA_PORT_ERR_GET_IRQ_NUM get the number of irqs, which is used to determine whether/how many interrupts error reporting feature supports. * DFL_FPGA_PORT_ERR_SET_IRQ set/unset given eventfds as error interrupt triggers. Signed-off-by: Luwei Kang Signed-off-by: Wu Hao Signed-off-by: Xu Yilun Reviewed-by: Marcelo Tosatti Acked-by: Wu Hao Signed-off-by: Moritz Fischer --- drivers/fpga/dfl-afu-error.c | 17 +++++++++++++++++ drivers/fpga/dfl-afu-main.c | 4 ++++ include/uapi/linux/fpga-dfl.h | 23 +++++++++++++++++++++++ 3 files changed, 44 insertions(+) (limited to 'include/uapi') diff --git a/drivers/fpga/dfl-afu-error.c b/drivers/fpga/dfl-afu-error.c index c1467ae1a6b6..c4691187cca9 100644 --- a/drivers/fpga/dfl-afu-error.c +++ b/drivers/fpga/dfl-afu-error.c @@ -14,6 +14,7 @@ * Mitchel Henry */ +#include #include #include "dfl-afu.h" @@ -219,6 +220,21 @@ static void port_err_uinit(struct platform_device *pdev, afu_port_err_mask(&pdev->dev, true); } +static long +port_err_ioctl(struct platform_device *pdev, struct dfl_feature *feature, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DFL_FPGA_PORT_ERR_GET_IRQ_NUM: + return dfl_feature_ioctl_get_num_irqs(pdev, feature, arg); + case DFL_FPGA_PORT_ERR_SET_IRQ: + return dfl_feature_ioctl_set_irq(pdev, feature, arg); + default: + dev_dbg(&pdev->dev, "%x cmd not handled", cmd); + return -ENODEV; + } +} + const struct dfl_feature_id port_err_id_table[] = { {.id = PORT_FEATURE_ID_ERROR,}, {0,} @@ -227,4 +243,5 @@ const struct dfl_feature_id port_err_id_table[] = { const struct dfl_feature_ops port_err_ops = { .init = port_err_init, .uinit = port_err_uinit, + .ioctl = port_err_ioctl, }; diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c index b0c31789a909..357cd5d9b267 100644 --- a/drivers/fpga/dfl-afu-main.c +++ b/drivers/fpga/dfl-afu-main.c @@ -577,6 +577,7 @@ static int afu_release(struct inode *inode, struct file *filp) { struct platform_device *pdev = filp->private_data; struct dfl_feature_platform_data *pdata; + struct dfl_feature *feature; dev_dbg(&pdev->dev, "Device File Release\n"); @@ -586,6 +587,9 @@ static int afu_release(struct inode *inode, struct file *filp) dfl_feature_dev_use_end(pdata); if (!dfl_feature_dev_use_count(pdata)) { + dfl_fpga_dev_for_each_feature(pdata, feature) + dfl_fpga_set_irq_triggers(feature, 0, + feature->nr_irqs, NULL); __port_reset(pdev); afu_dma_region_destroy(pdata); } diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h index 7331350f3067..6c71c9d7e926 100644 --- a/include/uapi/linux/fpga-dfl.h +++ b/include/uapi/linux/fpga-dfl.h @@ -164,6 +164,29 @@ struct dfl_fpga_irq_set { __s32 evtfds[]; }; +/** + * DFL_FPGA_PORT_ERR_GET_IRQ_NUM - _IOR(DFL_FPGA_MAGIC, DFL_PORT_BASE + 5, + * __u32 num_irqs) + * + * Get the number of irqs supported by the fpga port error reporting private + * feature. Currently hardware supports up to 1 irq. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_PORT_ERR_GET_IRQ_NUM _IOR(DFL_FPGA_MAGIC, \ + DFL_PORT_BASE + 5, __u32) + +/** + * DFL_FPGA_PORT_ERR_SET_IRQ - _IOW(DFL_FPGA_MAGIC, DFL_PORT_BASE + 6, + * struct dfl_fpga_irq_set) + * + * Set fpga port error reporting interrupt trigger if evtfds[n] is valid. + * Unset related interrupt trigger if evtfds[n] is a negative value. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_PORT_ERR_SET_IRQ _IOW(DFL_FPGA_MAGIC, \ + DFL_PORT_BASE + 6, \ + struct dfl_fpga_irq_set) + /* IOCTLs for FME file descriptor */ /** -- cgit v1.2.3 From d43f20bae5173ba431526040c320c36fdd4f086d Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 16 Jun 2020 12:08:46 +0800 Subject: fpga: dfl: fme: add interrupt support for global error reporting Error reporting interrupt is very useful to notify users that some errors are detected by the hardware. Once users are notified, they could query hardware logged error states, no need to continuously poll on these states. This patch adds interrupt support for fme global error reporting sub feature. It follows the common DFL interrupt notification and handling mechanism. And it implements two ioctls below for user to query number of irqs supported, and set/unset interrupt triggers. Ioctls: * DFL_FPGA_FME_ERR_GET_IRQ_NUM get the number of irqs, which is used to determine whether/how many interrupts fme error reporting feature supports. * DFL_FPGA_FME_ERR_SET_IRQ set/unset given eventfds as fme error reporting interrupt triggers. Signed-off-by: Luwei Kang Signed-off-by: Wu Hao Signed-off-by: Xu Yilun Reviewed-by: Marcelo Tosatti Acked-by: Wu Hao Signed-off-by: Moritz Fischer --- drivers/fpga/dfl-fme-error.c | 18 ++++++++++++++++++ drivers/fpga/dfl-fme-main.c | 6 ++++++ include/uapi/linux/fpga-dfl.h | 23 +++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'include/uapi') diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c index f897d414b923..51c2892ec06d 100644 --- a/drivers/fpga/dfl-fme-error.c +++ b/drivers/fpga/dfl-fme-error.c @@ -15,6 +15,7 @@ * Mitchel, Henry */ +#include #include #include "dfl.h" @@ -348,6 +349,22 @@ static void fme_global_err_uinit(struct platform_device *pdev, fme_err_mask(&pdev->dev, true); } +static long +fme_global_error_ioctl(struct platform_device *pdev, + struct dfl_feature *feature, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DFL_FPGA_FME_ERR_GET_IRQ_NUM: + return dfl_feature_ioctl_get_num_irqs(pdev, feature, arg); + case DFL_FPGA_FME_ERR_SET_IRQ: + return dfl_feature_ioctl_set_irq(pdev, feature, arg); + default: + dev_dbg(&pdev->dev, "%x cmd not handled", cmd); + return -ENODEV; + } +} + const struct dfl_feature_id fme_global_err_id_table[] = { {.id = FME_FEATURE_ID_GLOBAL_ERR,}, {0,} @@ -356,4 +373,5 @@ const struct dfl_feature_id fme_global_err_id_table[] = { const struct dfl_feature_ops fme_global_err_ops = { .init = fme_global_err_init, .uinit = fme_global_err_uinit, + .ioctl = fme_global_error_ioctl, }; diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c index fc210d4e1863..77ea04d4edbe 100644 --- a/drivers/fpga/dfl-fme-main.c +++ b/drivers/fpga/dfl-fme-main.c @@ -620,11 +620,17 @@ static int fme_release(struct inode *inode, struct file *filp) { struct dfl_feature_platform_data *pdata = filp->private_data; struct platform_device *pdev = pdata->dev; + struct dfl_feature *feature; dev_dbg(&pdev->dev, "Device File Release\n"); mutex_lock(&pdata->lock); dfl_feature_dev_use_end(pdata); + + if (!dfl_feature_dev_use_count(pdata)) + dfl_fpga_dev_for_each_feature(pdata, feature) + dfl_fpga_set_irq_triggers(feature, 0, + feature->nr_irqs, NULL); mutex_unlock(&pdata->lock); return 0; diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h index 6c71c9d7e926..b6495ea412ec 100644 --- a/include/uapi/linux/fpga-dfl.h +++ b/include/uapi/linux/fpga-dfl.h @@ -230,4 +230,27 @@ struct dfl_fpga_fme_port_pr { */ #define DFL_FPGA_FME_PORT_ASSIGN _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 2, int) +/** + * DFL_FPGA_FME_ERR_GET_IRQ_NUM - _IOR(DFL_FPGA_MAGIC, DFL_FME_BASE + 3, + * __u32 num_irqs) + * + * Get the number of irqs supported by the fpga fme error reporting private + * feature. Currently hardware supports up to 1 irq. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_FME_ERR_GET_IRQ_NUM _IOR(DFL_FPGA_MAGIC, \ + DFL_FME_BASE + 3, __u32) + +/** + * DFL_FPGA_FME_ERR_SET_IRQ - _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 4, + * struct dfl_fpga_irq_set) + * + * Set fpga fme error reporting interrupt trigger if evtfds[n] is valid. + * Unset related interrupt trigger if evtfds[n] is a negative value. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_FME_ERR_SET_IRQ _IOW(DFL_FPGA_MAGIC, \ + DFL_FME_BASE + 4, \ + struct dfl_fpga_irq_set) + #endif /* _UAPI_LINUX_FPGA_DFL_H */ -- cgit v1.2.3 From 09d86150141955ba1b3d7cbef23785f4996e4d6f Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 16 Jun 2020 12:08:47 +0800 Subject: fpga: dfl: afu: add AFU interrupt support AFU (Accelerated Function Unit) is dynamic region of the DFL based FPGA, and always defined by users. Some DFL based FPGA cards allow users to implement their own interrupts in AFU. In order to support this, hardware implements a new UINT (AFU Interrupt) private feature with related capability register which describes the number of supported AFU interrupts as well as the local index of the interrupts for software enumeration, and from software side, driver follows the common DFL interrupt notification and handling mechanism, and it implements two ioctls below for user to query number of irqs supported and set/unset interrupt triggers. Ioctls: * DFL_FPGA_PORT_UINT_GET_IRQ_NUM get the number of irqs, which is used to determine how many interrupts UINT feature supports. * DFL_FPGA_PORT_UINT_SET_IRQ set/unset eventfds as AFU interrupt triggers. Signed-off-by: Luwei Kang Signed-off-by: Wu Hao Signed-off-by: Xu Yilun Reviewed-by: Marcelo Tosatti Acked-by: Wu Hao Signed-off-by: Moritz Fischer --- drivers/fpga/dfl-afu-main.c | 28 ++++++++++++++++++++++++++++ include/uapi/linux/fpga-dfl.h | 23 +++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include/uapi') diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c index 357cd5d9b267..7c84feea7354 100644 --- a/drivers/fpga/dfl-afu-main.c +++ b/drivers/fpga/dfl-afu-main.c @@ -529,6 +529,30 @@ static const struct dfl_feature_ops port_stp_ops = { .init = port_stp_init, }; +static long +port_uint_ioctl(struct platform_device *pdev, struct dfl_feature *feature, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case DFL_FPGA_PORT_UINT_GET_IRQ_NUM: + return dfl_feature_ioctl_get_num_irqs(pdev, feature, arg); + case DFL_FPGA_PORT_UINT_SET_IRQ: + return dfl_feature_ioctl_set_irq(pdev, feature, arg); + default: + dev_dbg(&pdev->dev, "%x cmd not handled", cmd); + return -ENODEV; + } +} + +static const struct dfl_feature_id port_uint_id_table[] = { + {.id = PORT_FEATURE_ID_UINT,}, + {0,} +}; + +static const struct dfl_feature_ops port_uint_ops = { + .ioctl = port_uint_ioctl, +}; + static struct dfl_feature_driver port_feature_drvs[] = { { .id_table = port_hdr_id_table, @@ -546,6 +570,10 @@ static struct dfl_feature_driver port_feature_drvs[] = { .id_table = port_stp_id_table, .ops = &port_stp_ops, }, + { + .id_table = port_uint_id_table, + .ops = &port_uint_ops, + }, { .ops = NULL, } diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h index b6495ea412ec..1621b077bf21 100644 --- a/include/uapi/linux/fpga-dfl.h +++ b/include/uapi/linux/fpga-dfl.h @@ -187,6 +187,29 @@ struct dfl_fpga_irq_set { DFL_PORT_BASE + 6, \ struct dfl_fpga_irq_set) +/** + * DFL_FPGA_PORT_UINT_GET_IRQ_NUM - _IOR(DFL_FPGA_MAGIC, DFL_PORT_BASE + 7, + * __u32 num_irqs) + * + * Get the number of irqs supported by the fpga AFU interrupt private + * feature. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_PORT_UINT_GET_IRQ_NUM _IOR(DFL_FPGA_MAGIC, \ + DFL_PORT_BASE + 7, __u32) + +/** + * DFL_FPGA_PORT_UINT_SET_IRQ - _IOW(DFL_FPGA_MAGIC, DFL_PORT_BASE + 8, + * struct dfl_fpga_irq_set) + * + * Set fpga AFU interrupt trigger if evtfds[n] is valid. + * Unset related interrupt trigger if evtfds[n] is a negative value. + * Return: 0 on success, -errno on failure. + */ +#define DFL_FPGA_PORT_UINT_SET_IRQ _IOW(DFL_FPGA_MAGIC, \ + DFL_PORT_BASE + 8, \ + struct dfl_fpga_irq_set) + /* IOCTLs for FME file descriptor */ /** -- cgit v1.2.3 From 1ce50e7d408ef2bdc8ca021363fd46d1b8bfad00 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 6 Jul 2020 12:55:37 +0200 Subject: thermal: core: genetlink support for events/cmd/sampling Initially the thermal framework had a very simple notification mechanism to send generic netlink messages to the userspace. The notification function was never called from anywhere and the corresponding dead code was removed. It was probably a first attempt to introduce the netlink notification. At LPC2018, the presentation "Linux thermal: User kernel interface", proposed to create the notifications to the userspace via a kfifo. The advantage of the kfifo is the performance. It is usually used from a 1:1 communication channel where a driver captures data and sends it as fast as possible to a userspace process. The drawback is that only one process uses the notification channel exclusively, thus no other process is allowed to use the channel to get temperature or notifications. This patch defines a generic netlink API to discover the current thermal setup and adds event notifications as well as temperature sampling. As any genetlink protocol, it can evolve and the versioning allows to keep the backward compatibility. In order to prevent the user from getting flooded with data on a single channel, there are two multicast channels, one for the temperature sampling when the thermal zone is updated and another one for the events, so the user can get the events only without the thermal zone temperature sampling. Also, a list of commands to discover the thermal setup is added and can be extended when needed. Reviewed-by: Amit Kucheria Signed-off-by: Daniel Lezcano Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20200706105538.2159-3-daniel.lezcano@linaro.org --- drivers/thermal/Makefile | 2 +- drivers/thermal/thermal_core.h | 18 ++ drivers/thermal/thermal_netlink.c | 648 ++++++++++++++++++++++++++++++++++++++ include/linux/thermal.h | 17 - include/uapi/linux/thermal.h | 89 +++++- 5 files changed, 739 insertions(+), 35 deletions(-) create mode 100644 drivers/thermal/thermal_netlink.c (limited to 'include/uapi') diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 0c8b84a09b9a..1bbf0805fb04 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_THERMAL) += thermal_sys.o thermal_sys-y += thermal_core.o thermal_sysfs.o \ - thermal_helpers.o + thermal_helpers.o thermal_netlink.o # interface to/from other layers providing sensors thermal_sys-$(CONFIG_THERMAL_HWMON) += thermal_hwmon.o diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 967b0ba6593e..4d9455ef0c32 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -52,6 +52,24 @@ int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *), struct thermal_zone_device *thermal_zone_get_by_id(int id); +/* Netlink notification function */ +int thermal_notify_tz_create(int tz_id, const char *name); +int thermal_notify_tz_delete(int tz_id); +int thermal_notify_tz_enable(int tz_id); +int thermal_notify_tz_disable(int tz_id); +int thermal_notify_tz_trip_down(int tz_id, int id); +int thermal_notify_tz_trip_up(int tz_id, int id); +int thermal_notify_tz_trip_delete(int tz_id, int id); +int thermal_notify_tz_trip_add(int tz_id, int id, int type, + int temp, int hyst); +int thermal_notify_tz_trip_change(int tz_id, int id, int type, + int temp, int hyst); +int thermal_notify_cdev_state_update(int cdev_id, int state); +int thermal_notify_cdev_add(int cdev_id, const char *name, int max_state); +int thermal_notify_cdev_delete(int cdev_id); +int thermal_notify_tz_gov_change(int tz_id, const char *name); +int thermal_genl_sampling_temp(int id, int temp); + struct thermal_attr { struct device_attribute attr; char name[THERMAL_NAME_LENGTH]; diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c new file mode 100644 index 000000000000..dd0a3b889674 --- /dev/null +++ b/drivers/thermal/thermal_netlink.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Linaro Limited + * + * Author: Daniel Lezcano + * + * Generic netlink for thermal management framework + */ +#include +#include +#include +#include + +#include "thermal_core.h" + +static const struct genl_multicast_group thermal_genl_mcgrps[] = { + { .name = THERMAL_GENL_SAMPLING_GROUP_NAME, }, + { .name = THERMAL_GENL_EVENT_GROUP_NAME, }, +}; + +static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] = { + /* Thermal zone */ + [THERMAL_GENL_ATTR_TZ] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_TZ_ID] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TEMP] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TRIP] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_TZ_TRIP_ID] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TRIP_TEMP] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TRIP_TYPE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_TRIP_HYST] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_MODE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_CDEV_WEIGHT] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_TZ_NAME] = { .type = NLA_STRING, + .len = THERMAL_NAME_LENGTH }, + /* Governor(s) */ + [THERMAL_GENL_ATTR_TZ_GOV] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_TZ_GOV_NAME] = { .type = NLA_STRING, + .len = THERMAL_NAME_LENGTH }, + /* Cooling devices */ + [THERMAL_GENL_ATTR_CDEV] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_CDEV_ID] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CDEV_CUR_STATE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CDEV_MAX_STATE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CDEV_NAME] = { .type = NLA_STRING, + .len = THERMAL_NAME_LENGTH }, +}; + +struct param { + struct nlattr **attrs; + struct sk_buff *msg; + const char *name; + int tz_id; + int cdev_id; + int trip_id; + int trip_temp; + int trip_type; + int trip_hyst; + int temp; + int cdev_state; + int cdev_max_state; +}; + +typedef int (*cb_t)(struct param *); + +static struct genl_family thermal_gnl_family; + +/************************** Sampling encoding *******************************/ + +int thermal_genl_sampling_temp(int id, int temp) +{ + struct sk_buff *skb; + void *hdr; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, 0, 0, &thermal_gnl_family, 0, + THERMAL_GENL_SAMPLING_TEMP); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(skb, THERMAL_GENL_ATTR_TZ_ID, id)) + goto out_cancel; + + if (nla_put_u32(skb, THERMAL_GENL_ATTR_TZ_TEMP, temp)) + goto out_cancel; + + genlmsg_end(skb, hdr); + + genlmsg_multicast(&thermal_gnl_family, skb, 0, 0, GFP_KERNEL); + + return 0; +out_cancel: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + + return -EMSGSIZE; +} + +/**************************** Event encoding *********************************/ + +static int thermal_genl_event_tz_create(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_string(p->msg, THERMAL_GENL_ATTR_TZ_NAME, p->name)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_tz(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_tz_trip_up(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, p->trip_id)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_tz_trip_add(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, p->trip_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, p->trip_type) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, p->trip_temp) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, p->trip_hyst)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_tz_trip_delete(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, p->trip_id)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_cdev_add(struct param *p) +{ + if (nla_put_string(p->msg, THERMAL_GENL_ATTR_CDEV_NAME, + p->name) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, + p->cdev_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_MAX_STATE, + p->cdev_max_state)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_cdev_delete(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, p->cdev_id)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_cdev_state_update(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, + p->cdev_id) || + nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_CUR_STATE, + p->cdev_state)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_event_gov_change(struct param *p) +{ + if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || + nla_put_string(p->msg, THERMAL_GENL_ATTR_GOV_NAME, p->name)) + return -EMSGSIZE; + + return 0; +} + +int thermal_genl_event_tz_delete(struct param *p) + __attribute__((alias("thermal_genl_event_tz"))); + +int thermal_genl_event_tz_enable(struct param *p) + __attribute__((alias("thermal_genl_event_tz"))); + +int thermal_genl_event_tz_disable(struct param *p) + __attribute__((alias("thermal_genl_event_tz"))); + +int thermal_genl_event_tz_trip_down(struct param *p) + __attribute__((alias("thermal_genl_event_tz_trip_up"))); + +int thermal_genl_event_tz_trip_change(struct param *p) + __attribute__((alias("thermal_genl_event_tz_trip_add"))); + +static cb_t event_cb[] = { + [THERMAL_GENL_EVENT_TZ_CREATE] = thermal_genl_event_tz_create, + [THERMAL_GENL_EVENT_TZ_DELETE] = thermal_genl_event_tz_delete, + [THERMAL_GENL_EVENT_TZ_ENABLE] = thermal_genl_event_tz_enable, + [THERMAL_GENL_EVENT_TZ_DISABLE] = thermal_genl_event_tz_disable, + [THERMAL_GENL_EVENT_TZ_TRIP_UP] = thermal_genl_event_tz_trip_up, + [THERMAL_GENL_EVENT_TZ_TRIP_DOWN] = thermal_genl_event_tz_trip_down, + [THERMAL_GENL_EVENT_TZ_TRIP_CHANGE] = thermal_genl_event_tz_trip_change, + [THERMAL_GENL_EVENT_TZ_TRIP_ADD] = thermal_genl_event_tz_trip_add, + [THERMAL_GENL_EVENT_TZ_TRIP_DELETE] = thermal_genl_event_tz_trip_delete, + [THERMAL_GENL_EVENT_CDEV_ADD] = thermal_genl_event_cdev_add, + [THERMAL_GENL_EVENT_CDEV_DELETE] = thermal_genl_event_cdev_delete, + [THERMAL_GENL_EVENT_CDEV_STATE_UPDATE] = thermal_genl_event_cdev_state_update, + [THERMAL_GENL_EVENT_TZ_GOV_CHANGE] = thermal_genl_event_gov_change, +}; + +/* + * Generic netlink event encoding + */ +static int thermal_genl_send_event(enum thermal_genl_event event, + struct param *p) +{ + struct sk_buff *msg; + int ret = -EMSGSIZE; + void *hdr; + + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + p->msg = msg; + + hdr = genlmsg_put(msg, 0, 0, &thermal_gnl_family, 0, event); + if (!hdr) + goto out_free_msg; + + ret = event_cb[event](p); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&thermal_gnl_family, msg, 0, 1, GFP_KERNEL); + + return 0; + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +int thermal_notify_tz_create(int tz_id, const char *name) +{ + struct param p = { .tz_id = tz_id, .name = name }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_CREATE, &p); +} + +int thermal_notify_tz_delete(int tz_id) +{ + struct param p = { .tz_id = tz_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_DELETE, &p); +} + +int thermal_notify_tz_enable(int tz_id) +{ + struct param p = { .tz_id = tz_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_ENABLE, &p); +} + +int thermal_notify_tz_disable(int tz_id) +{ + struct param p = { .tz_id = tz_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_DISABLE, &p); +} + +int thermal_notify_tz_trip_down(int tz_id, int trip_id) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_DOWN, &p); +} + +int thermal_notify_tz_trip_up(int tz_id, int trip_id) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_UP, &p); +} + +int thermal_notify_tz_trip_add(int tz_id, int trip_id, int trip_type, + int trip_temp, int trip_hyst) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id, + .trip_type = trip_type, .trip_temp = trip_temp, + .trip_hyst = trip_hyst }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_ADD, &p); +} + +int thermal_notify_tz_trip_delete(int tz_id, int trip_id) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_DELETE, &p); +} + +int thermal_notify_tz_trip_change(int tz_id, int trip_id, int trip_type, + int trip_temp, int trip_hyst) +{ + struct param p = { .tz_id = tz_id, .trip_id = trip_id, + .trip_type = trip_type, .trip_temp = trip_temp, + .trip_hyst = trip_hyst }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_CHANGE, &p); +} + +int thermal_notify_cdev_state_update(int cdev_id, int cdev_state) +{ + struct param p = { .cdev_id = cdev_id, .cdev_state = cdev_state }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_STATE_UPDATE, &p); +} + +int thermal_notify_cdev_add(int cdev_id, const char *name, int cdev_max_state) +{ + struct param p = { .cdev_id = cdev_id, .name = name, + .cdev_max_state = cdev_max_state }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_ADD, &p); +} + +int thermal_notify_cdev_delete(int cdev_id) +{ + struct param p = { .cdev_id = cdev_id }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_DELETE, &p); +} + +int thermal_notify_tz_gov_change(int tz_id, const char *name) +{ + struct param p = { .tz_id = tz_id, .name = name }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_GOV_CHANGE, &p); +} + +/*************************** Command encoding ********************************/ + +static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz, + void *data) +{ + struct sk_buff *msg = data; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, tz->id) || + nla_put_string(msg, THERMAL_GENL_ATTR_TZ_NAME, tz->type)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_cmd_tz_get_id(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct nlattr *start_tz; + int ret; + + start_tz = nla_nest_start(msg, THERMAL_GENL_ATTR_TZ); + if (!start_tz) + return -EMSGSIZE; + + ret = for_each_thermal_zone(__thermal_genl_cmd_tz_get_id, msg); + if (ret) + goto out_cancel_nest; + + nla_nest_end(msg, start_tz); + + return 0; + +out_cancel_nest: + nla_nest_cancel(msg, start_tz); + + return ret; +} + +static int thermal_genl_cmd_tz_get_trip(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct thermal_zone_device *tz; + struct nlattr *start_trip; + int i, id; + + if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) + return -EINVAL; + + id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); + + tz = thermal_zone_get_by_id(id); + if (!tz) + return -EINVAL; + + start_trip = nla_nest_start(msg, THERMAL_GENL_ATTR_TZ_TRIP); + if (!start_trip) + return -EMSGSIZE; + + mutex_lock(&tz->lock); + + for (i = 0; i < tz->trips; i++) { + + enum thermal_trip_type type; + int temp, hyst; + + tz->ops->get_trip_type(tz, i, &type); + tz->ops->get_trip_temp(tz, i, &temp); + tz->ops->get_trip_hyst(tz, i, &hyst); + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, i) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, type) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, temp) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, hyst)) + goto out_cancel_nest; + } + + mutex_unlock(&tz->lock); + + nla_nest_end(msg, start_trip); + + return 0; + +out_cancel_nest: + mutex_unlock(&tz->lock); + + return -EMSGSIZE; +} + +static int thermal_genl_cmd_tz_get_temp(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct thermal_zone_device *tz; + int temp, ret, id; + + if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) + return -EINVAL; + + id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); + + tz = thermal_zone_get_by_id(id); + if (!tz) + return -EINVAL; + + ret = thermal_zone_get_temp(tz, &temp); + if (ret) + return ret; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, id) || + nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TEMP, temp)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_cmd_tz_get_gov(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct thermal_zone_device *tz; + int id, ret = 0; + + if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) + return -EINVAL; + + id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); + + tz = thermal_zone_get_by_id(id); + if (!tz) + return -EINVAL; + + mutex_lock(&tz->lock); + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, id) || + nla_put_string(msg, THERMAL_GENL_ATTR_TZ_GOV_NAME, + tz->governor->name)) + ret = -EMSGSIZE; + + mutex_unlock(&tz->lock); + + return ret; +} + +static int __thermal_genl_cmd_cdev_get(struct thermal_cooling_device *cdev, + void *data) +{ + struct sk_buff *msg = data; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_CDEV_ID, cdev->id)) + return -EMSGSIZE; + + if (nla_put_string(msg, THERMAL_GENL_ATTR_CDEV_NAME, cdev->type)) + return -EMSGSIZE; + + return 0; +} + +static int thermal_genl_cmd_cdev_get(struct param *p) +{ + struct sk_buff *msg = p->msg; + struct nlattr *start_cdev; + int ret; + + start_cdev = nla_nest_start(msg, THERMAL_GENL_ATTR_CDEV); + if (!start_cdev) + return -EMSGSIZE; + + ret = for_each_thermal_cooling_device(__thermal_genl_cmd_cdev_get, msg); + if (ret) + goto out_cancel_nest; + + nla_nest_end(msg, start_cdev); + + return 0; +out_cancel_nest: + nla_nest_cancel(msg, start_cdev); + + return ret; +} + +static cb_t cmd_cb[] = { + [THERMAL_GENL_CMD_TZ_GET_ID] = thermal_genl_cmd_tz_get_id, + [THERMAL_GENL_CMD_TZ_GET_TRIP] = thermal_genl_cmd_tz_get_trip, + [THERMAL_GENL_CMD_TZ_GET_TEMP] = thermal_genl_cmd_tz_get_temp, + [THERMAL_GENL_CMD_TZ_GET_GOV] = thermal_genl_cmd_tz_get_gov, + [THERMAL_GENL_CMD_CDEV_GET] = thermal_genl_cmd_cdev_get, +}; + +static int thermal_genl_cmd_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct param p = { .msg = skb }; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + int cmd = info->ops->cmd; + int ret = -EMSGSIZE; + void *hdr; + + hdr = genlmsg_put(skb, 0, 0, &thermal_gnl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + + ret = cmd_cb[cmd](&p); + if (ret) + goto out_cancel_msg; + + genlmsg_end(skb, hdr); + + return 0; + +out_cancel_msg: + genlmsg_cancel(skb, hdr); + + return ret; +} + +static int thermal_genl_cmd_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct param p = { .attrs = info->attrs }; + struct sk_buff *msg; + void *hdr; + int cmd = info->genlhdr->cmd; + int ret = -EMSGSIZE; + + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + p.msg = msg; + + hdr = genlmsg_put_reply(msg, info, &thermal_gnl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = cmd_cb[cmd](&p); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +static const struct genl_ops thermal_genl_ops[] = { + { + .cmd = THERMAL_GENL_CMD_TZ_GET_ID, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .dumpit = thermal_genl_cmd_dumpit, + }, + { + .cmd = THERMAL_GENL_CMD_TZ_GET_TRIP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = thermal_genl_cmd_doit, + }, + { + .cmd = THERMAL_GENL_CMD_TZ_GET_TEMP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = thermal_genl_cmd_doit, + }, + { + .cmd = THERMAL_GENL_CMD_TZ_GET_GOV, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = thermal_genl_cmd_doit, + }, + { + .cmd = THERMAL_GENL_CMD_CDEV_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .dumpit = thermal_genl_cmd_dumpit, + }, +}; + +static struct genl_family thermal_gnl_family __ro_after_init = { + .hdrsize = 0, + .name = THERMAL_GENL_FAMILY_NAME, + .version = THERMAL_GENL_VERSION, + .maxattr = THERMAL_GENL_ATTR_MAX, + .policy = thermal_genl_policy, + .ops = thermal_genl_ops, + .n_ops = ARRAY_SIZE(thermal_genl_ops), + .mcgrps = thermal_genl_mcgrps, + .n_mcgrps = ARRAY_SIZE(thermal_genl_mcgrps), +}; + +static int __init thermal_netlink_init(void) +{ + return genl_register_family(&thermal_gnl_family); +} +core_initcall(thermal_netlink_init); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 108251f23e5c..42ef807e5d84 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -37,18 +37,6 @@ struct thermal_cooling_device; struct thermal_instance; struct thermal_attr; -enum thermal_device_mode { - THERMAL_DEVICE_DISABLED = 0, - THERMAL_DEVICE_ENABLED, -}; - -enum thermal_trip_type { - THERMAL_TRIP_ACTIVE = 0, - THERMAL_TRIP_PASSIVE, - THERMAL_TRIP_HOT, - THERMAL_TRIP_CRITICAL, -}; - enum thermal_trend { THERMAL_TREND_STABLE, /* temperature is stable */ THERMAL_TREND_RAISING, /* temperature is raising */ @@ -303,11 +291,6 @@ struct thermal_zone_params { int offset; }; -struct thermal_genl_event { - u32 orig; - enum events event; -}; - /** * struct thermal_zone_of_device_ops - scallbacks for handling DT based zones * diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index 96218378dda8..c105054cbb57 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -4,31 +4,86 @@ #define THERMAL_NAME_LENGTH 20 -/* Adding event notification support elements */ -#define THERMAL_GENL_FAMILY_NAME "thermal_event" -#define THERMAL_GENL_VERSION 0x01 -#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_grp" - -/* Events supported by Thermal Netlink */ -enum events { - THERMAL_AUX0, - THERMAL_AUX1, - THERMAL_CRITICAL, - THERMAL_DEV_FAULT, +enum thermal_device_mode { + THERMAL_DEVICE_DISABLED = 0, + THERMAL_DEVICE_ENABLED, +}; + +enum thermal_trip_type { + THERMAL_TRIP_ACTIVE = 0, + THERMAL_TRIP_PASSIVE, + THERMAL_TRIP_HOT, + THERMAL_TRIP_CRITICAL, }; -/* attributes of thermal_genl_family */ -enum { +/* Adding event notification support elements */ +#define THERMAL_GENL_FAMILY_NAME "thermal" +#define THERMAL_GENL_VERSION 0x01 +#define THERMAL_GENL_SAMPLING_GROUP_NAME "sampling" +#define THERMAL_GENL_EVENT_GROUP_NAME "event" + +/* Attributes of thermal_genl_family */ +enum thermal_genl_attr { THERMAL_GENL_ATTR_UNSPEC, - THERMAL_GENL_ATTR_EVENT, + THERMAL_GENL_ATTR_TZ, + THERMAL_GENL_ATTR_TZ_ID, + THERMAL_GENL_ATTR_TZ_TEMP, + THERMAL_GENL_ATTR_TZ_TRIP, + THERMAL_GENL_ATTR_TZ_TRIP_ID, + THERMAL_GENL_ATTR_TZ_TRIP_TYPE, + THERMAL_GENL_ATTR_TZ_TRIP_TEMP, + THERMAL_GENL_ATTR_TZ_TRIP_HYST, + THERMAL_GENL_ATTR_TZ_MODE, + THERMAL_GENL_ATTR_TZ_NAME, + THERMAL_GENL_ATTR_TZ_CDEV_WEIGHT, + THERMAL_GENL_ATTR_TZ_GOV, + THERMAL_GENL_ATTR_TZ_GOV_NAME, + THERMAL_GENL_ATTR_CDEV, + THERMAL_GENL_ATTR_CDEV_ID, + THERMAL_GENL_ATTR_CDEV_CUR_STATE, + THERMAL_GENL_ATTR_CDEV_MAX_STATE, + THERMAL_GENL_ATTR_CDEV_NAME, + THERMAL_GENL_ATTR_GOV_NAME, + __THERMAL_GENL_ATTR_MAX, }; #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) -/* commands supported by the thermal_genl_family */ -enum { +enum thermal_genl_sampling { + THERMAL_GENL_SAMPLING_TEMP, + __THERMAL_GENL_SAMPLING_MAX, +}; +#define THERMAL_GENL_SAMPLING_MAX (__THERMAL_GENL_SAMPLING_MAX - 1) + +/* Events of thermal_genl_family */ +enum thermal_genl_event { + THERMAL_GENL_EVENT_UNSPEC, + THERMAL_GENL_EVENT_TZ_CREATE, /* Thermal zone creation */ + THERMAL_GENL_EVENT_TZ_DELETE, /* Thermal zone deletion */ + THERMAL_GENL_EVENT_TZ_DISABLE, /* Thermal zone disabed */ + THERMAL_GENL_EVENT_TZ_ENABLE, /* Thermal zone enabled */ + THERMAL_GENL_EVENT_TZ_TRIP_UP, /* Trip point crossed the way up */ + THERMAL_GENL_EVENT_TZ_TRIP_DOWN, /* Trip point crossed the way down */ + THERMAL_GENL_EVENT_TZ_TRIP_CHANGE, /* Trip point changed */ + THERMAL_GENL_EVENT_TZ_TRIP_ADD, /* Trip point added */ + THERMAL_GENL_EVENT_TZ_TRIP_DELETE, /* Trip point deleted */ + THERMAL_GENL_EVENT_CDEV_ADD, /* Cdev bound to the thermal zone */ + THERMAL_GENL_EVENT_CDEV_DELETE, /* Cdev unbound */ + THERMAL_GENL_EVENT_CDEV_STATE_UPDATE, /* Cdev state updated */ + THERMAL_GENL_EVENT_TZ_GOV_CHANGE, /* Governor policy changed */ + __THERMAL_GENL_EVENT_MAX, +}; +#define THERMAL_GENL_EVENT_MAX (__THERMAL_GENL_EVENT_MAX - 1) + +/* Commands supported by the thermal_genl_family */ +enum thermal_genl_cmd { THERMAL_GENL_CMD_UNSPEC, - THERMAL_GENL_CMD_EVENT, + THERMAL_GENL_CMD_TZ_GET_ID, /* List of thermal zones id */ + THERMAL_GENL_CMD_TZ_GET_TRIP, /* List of thermal trips */ + THERMAL_GENL_CMD_TZ_GET_TEMP, /* Get the thermal zone temperature */ + THERMAL_GENL_CMD_TZ_GET_GOV, /* Get the thermal zone governor */ + THERMAL_GENL_CMD_TZ_GET_MODE, /* Get the thermal zone mode */ + THERMAL_GENL_CMD_CDEV_GET, /* List of cdev id */ __THERMAL_GENL_CMD_MAX, }; #define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1) -- cgit v1.2.3 From f5836749c9c04a10decd2742845ad4870965fdef Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 6 Jul 2020 16:01:25 -0700 Subject: bpf: Add BPF_CGROUP_INET_SOCK_RELEASE hook Sometimes it's handy to know when the socket gets freed. In particular, we'd like to try to use a smarter allocation of ports for bpf_bind and explore the possibility of limiting the number of SOCK_DGRAM sockets the process can have. Implement BPF_CGROUP_INET_SOCK_RELEASE hook that triggers on inet socket release. It triggers only for userspace sockets (not in-kernel ones) and therefore has the same semantics as the existing BPF_CGROUP_INET_SOCK_CREATE. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200706230128.4073544-2-sdf@google.com --- include/linux/bpf-cgroup.h | 4 ++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 3 +++ net/core/filter.c | 1 + net/ipv4/af_inet.c | 3 +++ 5 files changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c66c545e161a..2c6f26670acc 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -210,6 +210,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) +#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) @@ -401,6 +404,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da9bf35a26f8..548a749aebb3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -226,6 +226,7 @@ enum bpf_attach_type { BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..156f51ffada2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1981,6 +1981,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; @@ -2779,6 +2780,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; @@ -2929,6 +2931,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: diff --git a/net/core/filter.c b/net/core/filter.c index c5e696e6c315..ddcc0d6209e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6890,6 +6890,7 @@ static bool __sock_filter_check_attach_type(int off, case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ea6ed6d487ed..ff141d630bdf 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -411,6 +411,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + if (!sk->sk_kern_sock) + BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); + /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); -- cgit v1.2.3 From 82394db7383d33641f3f565bd79792fb41b1741f Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 29 Jun 2020 12:06:37 -0700 Subject: block: add capacity field to zone descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the zoned storage model, the sectors within a zone are typically all writeable. With the introduction of the Zoned Namespace (ZNS) Command Set in the NVM Express organization, the model was extended to have a specific writeable capacity. Extend the zone descriptor data structure with a zone capacity field to indicate to the user how many sectors in a zone are writeable. Introduce backward compatibility in the zone report ioctl by extending the zone report header data structure with a flags field to indicate if the capacity field is available. Reviewed-by: Jens Axboe Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Matias Bjørling Signed-off-by: Christoph Hellwig --- block/blk-zoned.c | 1 + drivers/block/null_blk_zoned.c | 2 ++ drivers/scsi/sd_zbc.c | 1 + include/uapi/linux/blkzoned.h | 15 +++++++++++++-- 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 23831fa8701d..81152a260354 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -312,6 +312,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return ret; rep.nr_zones = ret; + rep.flags = BLK_ZONE_REP_CAPACITY; if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) return -EFAULT; return 0; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index cc47606d8ffe..624aac09b005 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -47,6 +47,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->wp = zone->start + zone->len; zone->type = BLK_ZONE_TYPE_CONVENTIONAL; zone->cond = BLK_ZONE_COND_NOT_WP; @@ -59,6 +60,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = zone->wp = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 6f7eba66687e..183a20720da9 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -59,6 +59,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, zone.non_seq = 1; zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); + zone.capacity = zone.len; zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16])); zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); if (zone.type != ZBC_ZONE_TYPE_CONV && diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 0cdef67135f0..42c3366cc25f 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -73,6 +73,15 @@ enum blk_zone_cond { BLK_ZONE_COND_OFFLINE = 0xF, }; +/** + * enum blk_zone_report_flags - Feature flags of reported zone descriptors. + * + * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field. + */ +enum blk_zone_report_flags { + BLK_ZONE_REP_CAPACITY = (1 << 0), +}; + /** * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl. * @@ -99,7 +108,9 @@ struct blk_zone { __u8 cond; /* Zone condition */ __u8 non_seq; /* Non-sequential write resources active */ __u8 reset; /* Reset write pointer recommended */ - __u8 reserved[36]; + __u8 resv[4]; + __u64 capacity; /* Zone capacity in number of sectors */ + __u8 reserved[24]; }; /** @@ -115,7 +126,7 @@ struct blk_zone { struct blk_zone_report { __u64 sector; __u32 nr_zones; - __u8 reserved[4]; + __u32 flags; struct blk_zone zones[0]; }; -- cgit v1.2.3 From 1aa561b1a4c0ae2a9a9b9c21a84b5ca66b4775d8 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jun 2020 16:56:21 -0700 Subject: kvm: x86: Add "last CPU" to some KVM_EXIT information More often than not, a failed VM-entry in an x86 production environment is induced by a defective CPU. To help identify the bad hardware, include the id of the last logical CPU to run a vCPU in the information provided to userspace on a KVM exit for failed VM-entry or for KVM internal errors not associated with emulation. The presence of this additional information is indicated by a new capability, KVM_CAP_LAST_CPU. Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Message-Id: <20200603235623.245638-5-jmattson@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 1 + arch/x86/kvm/svm/svm.c | 4 +++- arch/x86/kvm/vmx/vmx.c | 10 ++++++++-- arch/x86/kvm/x86.c | 1 + include/uapi/linux/kvm.h | 2 ++ 5 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 426f94582b7a..1cfe79b932d6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4794,6 +4794,7 @@ hardware_exit_reason. /* KVM_EXIT_FAIL_ENTRY */ struct { __u64 hardware_entry_failure_reason; + __u32 cpu; /* if KVM_LAST_CPU */ } fail_entry; If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 24b7f321874f..8ecd46f2cb1e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2947,6 +2947,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + kvm_run->fail_entry.cpu = svm->last_cpu; dump_vmcb(vcpu); return 0; } @@ -2970,8 +2971,9 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = svm->last_cpu; return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4d8f12c0a5c6..b52bcebfa094 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4781,10 +4781,11 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 3; + vcpu->run->internal.ndata = 4; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; vcpu->run->internal.data[2] = error_code; + vcpu->run->internal.data[3] = vmx->last_cpu; return 0; } @@ -6006,6 +6007,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; + vcpu->run->fail_entry.cpu = vmx->last_cpu; return 0; } @@ -6014,6 +6016,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + vcpu->run->fail_entry.cpu = vmx->last_cpu; return 0; } @@ -6040,6 +6043,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } + vcpu->run->internal.data[vcpu->run->internal.ndata++] = + vmx->last_cpu; return 0; } @@ -6095,8 +6100,9 @@ unexpected_vmexit: vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vmx->last_cpu; return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 82f457f0e7e0..1a0fad1018f9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3510,6 +3510,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_LAST_CPU: r = 1; break; case KVM_CAP_SYNC_REGS: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4fdf30316582..ff9b335620d0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -289,6 +289,7 @@ struct kvm_run { /* KVM_EXIT_FAIL_ENTRY */ struct { __u64 hardware_entry_failure_reason; + __u32 cpu; } fail_entry; /* KVM_EXIT_EXCEPTION */ struct { @@ -1031,6 +1032,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 +#define KVM_CAP_LAST_CPU 184 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 065e0d42a0a728d7f6c2aec7c9f3e5dc7b715394 Mon Sep 17 00:00:00 2001 From: Meir Lichtinger Date: Mon, 6 Jul 2020 20:42:32 -0700 Subject: ethtool: Add support for 100Gbps per lane link modes Define 100G, 200G and 400G link modes using 100Gbps per lane LR, ER and FR are defined as a single link mode because they are using same technology and by design are fully interoperable. EEPROM content indicates if the module is LR, ER, or FR, and the user space ethtool decoder is planned to support decoding these modes in the EEPROM. Signed-off-by: Meir Lichtinger CC: Andrew Lunn Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 17 ++++++++++++++++- include/uapi/linux/ethtool.h | 15 +++++++++++++++ net/ethtool/common.c | 15 +++++++++++++++ net/ethtool/linkmodes.c | 15 +++++++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 46bd68e9ecfa..ff8e14b01eeb 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -8,7 +8,7 @@ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 75, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 90, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -78,12 +78,22 @@ static const struct phy_setting settings[] = { PHY_SETTING( 400000, FULL, 400000baseLR8_ER8_FR8_Full ), PHY_SETTING( 400000, FULL, 400000baseDR8_Full ), PHY_SETTING( 400000, FULL, 400000baseSR8_Full ), + PHY_SETTING( 400000, FULL, 400000baseCR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseKR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseLR4_ER4_FR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseDR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseSR4_Full ), /* 200G */ PHY_SETTING( 200000, FULL, 200000baseCR4_Full ), PHY_SETTING( 200000, FULL, 200000baseKR4_Full ), PHY_SETTING( 200000, FULL, 200000baseLR4_ER4_FR4_Full ), PHY_SETTING( 200000, FULL, 200000baseDR4_Full ), PHY_SETTING( 200000, FULL, 200000baseSR4_Full ), + PHY_SETTING( 200000, FULL, 200000baseCR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseKR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseLR2_ER2_FR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseDR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseSR2_Full ), /* 100G */ PHY_SETTING( 100000, FULL, 100000baseCR4_Full ), PHY_SETTING( 100000, FULL, 100000baseKR4_Full ), @@ -94,6 +104,11 @@ static const struct phy_setting settings[] = { PHY_SETTING( 100000, FULL, 100000baseLR2_ER2_FR2_Full ), PHY_SETTING( 100000, FULL, 100000baseDR2_Full ), PHY_SETTING( 100000, FULL, 100000baseSR2_Full ), + PHY_SETTING( 100000, FULL, 100000baseCR_Full ), + PHY_SETTING( 100000, FULL, 100000baseKR_Full ), + PHY_SETTING( 100000, FULL, 100000baseLR_ER_FR_Full ), + PHY_SETTING( 100000, FULL, 100000baseDR_Full ), + PHY_SETTING( 100000, FULL, 100000baseSR_Full ), /* 56G */ PHY_SETTING( 56000, FULL, 56000baseCR4_Full ), PHY_SETTING( 56000, FULL, 56000baseKR4_Full ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d1413538ef30..60856e0f9618 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1600,6 +1600,21 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, + ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, + ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, + ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, + ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, + ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, + ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, + ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, + ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, + ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, + ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, + ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, + ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, + ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, + ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index ce4dbae5a943..c54166713797 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -176,6 +176,21 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, DR8, Full), __DEFINE_LINK_MODE_NAME(400000, CR8, Full), __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"), + __DEFINE_LINK_MODE_NAME(100000, KR, Full), + __DEFINE_LINK_MODE_NAME(100000, SR, Full), + __DEFINE_LINK_MODE_NAME(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_NAME(100000, DR, Full), + __DEFINE_LINK_MODE_NAME(100000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR2, Full), + __DEFINE_LINK_MODE_NAME(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_NAME(200000, DR2, Full), + __DEFINE_LINK_MODE_NAME(200000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR4, Full), + __DEFINE_LINK_MODE_NAME(400000, SR4, Full), + __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_NAME(400000, DR4, Full), + __DEFINE_LINK_MODE_NAME(400000, CR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index fd4f3e58c6f6..317a93129551 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -257,6 +257,21 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), }; static const struct nla_policy -- cgit v1.2.3 From 6d5f904904608a9cd32854d7d0a4dd65b27f9935 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 9 Jul 2020 09:15:29 +0800 Subject: io_uring: export cq overflow status to userspace For those applications which are not willing to use io_uring_enter() to reap and handle cqes, they may completely rely on liburing's io_uring_peek_cqe(), but if cq ring has overflowed, currently because io_uring_peek_cqe() is not aware of this overflow, it won't enter kernel to flush cqes, below test program can reveal this bug: static void test_cq_overflow(struct io_uring *ring) { struct io_uring_cqe *cqe; struct io_uring_sqe *sqe; int issued = 0; int ret = 0; do { sqe = io_uring_get_sqe(ring); if (!sqe) { fprintf(stderr, "get sqe failed\n"); break;; } ret = io_uring_submit(ring); if (ret <= 0) { if (ret != -EBUSY) fprintf(stderr, "sqe submit failed: %d\n", ret); break; } issued++; } while (ret > 0); assert(ret == -EBUSY); printf("issued requests: %d\n", issued); while (issued) { ret = io_uring_peek_cqe(ring, &cqe); if (ret) { if (ret != -EAGAIN) { fprintf(stderr, "peek completion failed: %s\n", strerror(ret)); break; } printf("left requets: %d\n", issued); continue; } io_uring_cqe_seen(ring, cqe); issued--; printf("left requets: %d\n", issued); } } int main(int argc, char *argv[]) { int ret; struct io_uring ring; ret = io_uring_queue_init(16, &ring, 0); if (ret) { fprintf(stderr, "ring setup failed: %d\n", ret); return 1; } test_cq_overflow(&ring); return 0; } To fix this issue, export cq overflow status to userspace by adding new IORING_SQ_CQ_OVERFLOW flag, then helper functions() in liburing, such as io_uring_peek_cqe, can be aware of this cq overflow and do flush accordingly. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 +++++++++-- include/uapi/linux/io_uring.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/io_uring.c b/fs/io_uring.c index d37d7ea5ebe5..32e37c38f274 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1274,6 +1274,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { clear_bit(0, &ctx->sq_check_overflow); clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -1311,6 +1312,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); @@ -6080,9 +6082,9 @@ static int io_sq_thread(void *data) } /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - /* make sure to read SQ tail after writing flags */ - smp_mb(); + spin_unlock_irq(&ctx->completion_lock); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -6100,13 +6102,17 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); } mutex_lock(&ctx->uring_lock); @@ -7488,6 +7494,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (list_empty(&ctx->cq_overflow_list)) { clear_bit(0, &ctx->sq_check_overflow); clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } spin_unlock_irq(&ctx->completion_lock); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 92c22699a5a7..7843742b8b74 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -197,6 +197,7 @@ struct io_sqring_offsets { * sq_ring->flags */ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ +#define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */ struct io_cqring_offsets { __u32 head; -- cgit v1.2.3 From 3f935c75eb52dd968351dba824adf466fb9c9429 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:39 +0200 Subject: inet_diag: support for wider protocol numbers After commit bf9765145b85 ("sock: Make sk_protocol a 16-bit value") the current size of 'sdiag_protocol' is not sufficient to represent the possible protocol values. This change introduces a new inet diag request attribute to let user space specify the relevant protocol number using u32 values. The attribute is parsed by inet diag core on get/dump command and the extended protocol value, if available, is preferred to 'sdiag_protocol' to lookup the diag handler. The parse attributed are exposed to all the diag handlers via the cb->data. Note that inet_diag_dump_one_icsk() is left unmodified, as it will not be used by protocol using the extended attribute. Suggested-by: David S. Miller Co-developed-by: Christoph Paasch Signed-off-by: Christoph Paasch Acked-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 1 + net/core/sock.c | 1 + net/ipv4/inet_diag.c | 65 +++++++++++++++++++++++++++++++----------- 3 files changed, 50 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index e6f183ee8417..5ba122c1949a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -65,6 +65,7 @@ enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, INET_DIAG_REQ_SK_BPF_STORAGES, + INET_DIAG_REQ_PROTOCOL, __INET_DIAG_REQ_MAX, }; diff --git a/net/core/sock.c b/net/core/sock.c index f5b5fdd61c88..de26fe4ea19f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3566,6 +3566,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && + protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 125f4f8a36b4..4a98dd736270 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -52,6 +52,11 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { + if (proto < 0 || proto >= IPPROTO_MAX) { + mutex_lock(&inet_diag_table_mutex); + return ERR_PTR(-ENOENT); + } + if (!inet_diag_table[proto]) sock_load_diag_module(AF_INET, proto); @@ -181,6 +186,28 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr **req_nlas) +{ + struct nlattr *nla; + int remaining; + + nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + req_nlas[type] = nla; + } +} + +static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, + const struct inet_diag_dump_data *data) +{ + if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) + return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); + return req->sdiag_protocol; +} + #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, @@ -198,7 +225,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, void *info = NULL; cb_data = cb->data; - handler = inet_diag_table[req->sdiag_protocol]; + handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)]; BUG_ON(!handler); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -539,20 +566,25 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, + int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; - int err; + struct inet_diag_dump_data dump_data; + int err, protocol; - handler = inet_diag_lock_handler(req->sdiag_protocol); + memset(&dump_data, 0, sizeof(dump_data)); + inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); + protocol = inet_diag_get_protocol(req, &dump_data); + + handler = inet_diag_lock_handler(protocol); if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { - struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, - .data = &empty_dump_data, + .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -1103,13 +1135,16 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { + struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; - int err = 0; + int protocol, err = 0; + + protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; - handler = inet_diag_lock_handler(r->sdiag_protocol); + handler = inet_diag_lock_handler(protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); else @@ -1139,19 +1174,13 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) struct inet_diag_dump_data *cb_data; struct sk_buff *skb = cb->skb; struct nlattr *nla; - int rem, err; + int err; cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); if (!cb_data) return -ENOMEM; - nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), rem) { - int type = nla_type(nla); - - if (type < __INET_DIAG_REQ_MAX) - cb_data->req_nlas[type] = nla; - } + inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); nla = cb_data->inet_diag_nla_bc; if (nla) { @@ -1237,7 +1266,8 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req); + return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, + sizeof(struct inet_diag_req), &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -1279,7 +1309,8 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) return netlink_dump_start(net->diag_nlsk, skb, h, &c); } - return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); + return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen, + nlmsg_data(h)); } static -- cgit v1.2.3 From ac3b45f6095452a9731f8825be1513d326dbfa15 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:41 +0200 Subject: mptcp: add MPTCP socket diag interface exposes basic inet socket attribute, plus some MPTCP socket fields comprising PM status and MPTCP-level sequence numbers. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 17 +++++ net/mptcp/Kconfig | 4 ++ net/mptcp/Makefile | 2 + net/mptcp/mptcp_diag.c | 169 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+) create mode 100644 net/mptcp/mptcp_diag.c (limited to 'include/uapi') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5f2c77082d9e..9762660df741 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -86,4 +86,21 @@ enum { __MPTCP_PM_CMD_AFTER_LAST }; +#define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) +#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) + +struct mptcp_info { + __u8 mptcpi_subflows; + __u8 mptcpi_add_addr_signal; + __u8 mptcpi_add_addr_accepted; + __u8 mptcpi_subflows_max; + __u8 mptcpi_add_addr_signal_max; + __u8 mptcpi_add_addr_accepted_max; + __u32 mptcpi_flags; + __u32 mptcpi_token; + __u64 mptcpi_write_seq; + __u64 mptcpi_snd_una; + __u64 mptcpi_rcv_nxt; +}; + #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index af84fce70bb0..698bc3525160 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -13,6 +13,10 @@ config MPTCP if MPTCP +config INET_MPTCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" select IPV6 diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index c53f9b845523..2360cbd27d59 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -4,6 +4,8 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o + mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c new file mode 100644 index 000000000000..5f390a97f556 --- /dev/null +++ b/net/mptcp/mptcp_diag.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2020 Red Hat + * + * Author: Paolo Abeni + */ + +#include +#include +#include +#include +#include +#include "protocol.h" + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, + net_admin); +} + +static int mptcp_diag_dump_one(struct netlink_callback *cb, + const struct inet_diag_req_v2 *req) +{ + struct sk_buff *in_skb = cb->skb; + struct mptcp_sock *msk = NULL; + struct sk_buff *rep; + int err = -ENOENT; + struct net *net; + struct sock *sk; + + net = sock_net(in_skb->sk); + msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + if (!msk) + goto out_nosk; + + err = -ENOMEM; + sk = (struct sock *)msk; + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct mptcp_info)) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, + GFP_KERNEL); + if (!rep) + goto out; + + err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + sock_put(sk); + +out_nosk: + return err; +} + +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; + struct mptcp_sock *msk; + struct nlattr *bc; + + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; + + while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != + NULL) { + struct inet_sock *inet = (struct inet_sock *)msk; + struct sock *sk = (struct sock *)msk; + int ret = 0; + + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); +next: + sock_put(sk); + if (ret < 0) { + /* will retry on the same position */ + cb->args[1]--; + break; + } + cond_resched(); + } +} + +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_info *info = _info; + u32 flags = 0; + bool slow; + u8 val; + + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); + if (!info) + return; + + slow = lock_sock_fast(sk); + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max); + val = READ_ONCE(msk->pm.add_addr_signal_max); + info->mptcpi_add_addr_signal_max = val; + val = READ_ONCE(msk->pm.add_addr_accept_max); + info->mptcpi_add_addr_accepted_max = val; + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if (READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + unlock_sock_fast(sk, slow); +} + +static const struct inet_diag_handler mptcp_diag_handler = { + .dump = mptcp_diag_dump, + .dump_one = mptcp_diag_dump_one, + .idiag_get_info = mptcp_diag_get_info, + .idiag_type = IPPROTO_MPTCP, + .idiag_info_size = sizeof(struct mptcp_info), +}; + +static int __init mptcp_diag_init(void) +{ + return inet_diag_register(&mptcp_diag_handler); +} + +static void __exit mptcp_diag_exit(void) +{ + inet_diag_unregister(&mptcp_diag_handler); +} + +module_init(mptcp_diag_init); +module_exit(mptcp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); -- cgit v1.2.3 From a21cf0a8330bba60e44ca6c99e1591042f336ff5 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 9 Jul 2020 16:18:18 +0300 Subject: devlink: Add a new devlink port lanes attribute and pass to netlink Add a new devlink port attribute that indicates the port's number of lanes. Drivers are expected to set it via devlink_port_attrs_set(), before registering the port. The attribute is not passed to user space in case the number of lanes is invalid (0). Signed-off-by: Danielle Ratson Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 1 + include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 4 ++++ 4 files changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index f44cb1a537f3..6cde196f6b70 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2134,6 +2134,7 @@ static int __mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port, int err; attrs.split = split; + attrs.lanes = lanes; attrs.flavour = flavour; attrs.phys.port_number = port_number; attrs.phys.split_subport_number = split_port_subnumber; diff --git a/include/net/devlink.h b/include/net/devlink.h index 8f9db991192d..91a9f8770d08 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -68,10 +68,12 @@ struct devlink_port_pci_vf_attrs { * struct devlink_port_attrs - devlink port object * @flavour: flavour of the port * @split: indicates if this is split port + * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. * @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL */ struct devlink_port_attrs { u8 split:1; + u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; union { diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 87c83a82991b..f741ab8d9cf0 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -455,6 +455,8 @@ enum devlink_attr { DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ + DEVLINK_ATTR_PORT_LANES, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 266936c38357..7f26d1054974 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -530,6 +530,10 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (!devlink_port->attrs_set) return 0; + if (attrs->lanes) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) + return -EMSGSIZE; + } if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { -- cgit v1.2.3 From a0f49b54865273c895be3826d6d59cbc5ad725c2 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 9 Jul 2020 16:18:20 +0300 Subject: devlink: Add a new devlink port split ability attribute and pass to netlink Add a new attribute that indicates the split ability of devlink port. Drivers are expected to set it via devlink_port_attrs_set(), before registering the port. Signed-off-by: Danielle Ratson Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 1 + include/net/devlink.h | 4 +++- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 3 +++ 5 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index f85f5d88d331..8b3791d73c99 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2135,6 +2135,7 @@ static int __mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port, attrs.split = split; attrs.lanes = lanes; + attrs.splittable = splittable; attrs.flavour = flavour; attrs.phys.port_number = port_number; attrs.phys.split_subport_number = split_port_subnumber; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 71f4e624b3db..b6a10565309a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -367,6 +367,7 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port) return ret; attrs.split = eth_port.is_split; + attrs.splittable = !attrs.split; attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; attrs.phys.port_number = eth_port.label_port; attrs.phys.split_subport_number = eth_port.label_subport; diff --git a/include/net/devlink.h b/include/net/devlink.h index 91a9f8770d08..746bed538664 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -68,11 +68,13 @@ struct devlink_port_pci_vf_attrs { * struct devlink_port_attrs - devlink port object * @flavour: flavour of the port * @split: indicates if this is split port + * @splittable: indicates if the port can be split. * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. * @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL */ struct devlink_port_attrs { - u8 split:1; + u8 split:1, + splittable:1; u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index f741ab8d9cf0..cfef4245ea5a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -456,6 +456,7 @@ enum devlink_attr { DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ DEVLINK_ATTR_PORT_LANES, /* u32 */ + DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 7f26d1054974..94c797b74378 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -534,6 +534,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) + return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { @@ -7547,6 +7549,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; + WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); -- cgit v1.2.3 From f794db6841e5480208f0c3a3ac1df445a96b079e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 9 Jul 2020 14:08:51 +0200 Subject: virt: vbox: Fix VBGL_IOCTL_VMMDEV_REQUEST_BIG and _LOG req numbers to match upstream Until this commit the mainline kernel version (this version) of the vboxguest module contained a bug where it defined VBGL_IOCTL_VMMDEV_REQUEST_BIG and VBGL_IOCTL_LOG using _IOC(_IOC_READ | _IOC_WRITE, 'V', ...) instead of _IO(V, ...) as the out of tree VirtualBox upstream version does. Since the VirtualBox userspace bits are always built against VirtualBox upstream's headers, this means that so far the mainline kernel version of the vboxguest module has been failing these 2 ioctls with -ENOTTY. I guess that VBGL_IOCTL_VMMDEV_REQUEST_BIG is never used causing us to not hit that one and sofar the vboxguest driver has failed to actually log any log messages passed it through VBGL_IOCTL_LOG. This commit changes the VBGL_IOCTL_VMMDEV_REQUEST_BIG and VBGL_IOCTL_LOG defines to match the out of tree VirtualBox upstream vboxguest version, while keeping compatibility with the old wrong request defines so as to not break the kernel ABI in case someone has been using the old request defines. Fixes: f6ddd094f579 ("virt: Add vboxguest driver for Virtual Box Guest integration UAPI") Cc: stable@vger.kernel.org Acked-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200709120858.63928-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/virt/vboxguest/vboxguest_core.c | 4 +++- drivers/virt/vboxguest/vboxguest_core.h | 15 +++++++++++++++ drivers/virt/vboxguest/vboxguest_linux.c | 3 ++- include/uapi/linux/vboxguest.h | 4 ++-- 4 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c index b690a8a4bf9e..8fab04e76c14 100644 --- a/drivers/virt/vboxguest/vboxguest_core.c +++ b/drivers/virt/vboxguest/vboxguest_core.c @@ -1520,7 +1520,8 @@ int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data) /* For VMMDEV_REQUEST hdr->type != VBG_IOCTL_HDR_TYPE_DEFAULT */ if (req_no_size == VBG_IOCTL_VMMDEV_REQUEST(0) || - req == VBG_IOCTL_VMMDEV_REQUEST_BIG) + req == VBG_IOCTL_VMMDEV_REQUEST_BIG || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT) return vbg_ioctl_vmmrequest(gdev, session, data); if (hdr->type != VBG_IOCTL_HDR_TYPE_DEFAULT) @@ -1558,6 +1559,7 @@ int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data) case VBG_IOCTL_HGCM_CALL(0): return vbg_ioctl_hgcm_call(gdev, session, f32bit, data); case VBG_IOCTL_LOG(0): + case VBG_IOCTL_LOG_ALT(0): return vbg_ioctl_log(data); } diff --git a/drivers/virt/vboxguest/vboxguest_core.h b/drivers/virt/vboxguest/vboxguest_core.h index 4188c12b839f..77c3a9c8255d 100644 --- a/drivers/virt/vboxguest/vboxguest_core.h +++ b/drivers/virt/vboxguest/vboxguest_core.h @@ -15,6 +15,21 @@ #include #include "vmmdev.h" +/* + * The mainline kernel version (this version) of the vboxguest module + * contained a bug where it defined VBGL_IOCTL_VMMDEV_REQUEST_BIG and + * VBGL_IOCTL_LOG using _IOC(_IOC_READ | _IOC_WRITE, 'V', ...) instead + * of _IO(V, ...) as the out of tree VirtualBox upstream version does. + * + * These _ALT definitions keep compatibility with the wrong defines the + * mainline kernel version used for a while. + * Note the VirtualBox userspace bits have always been built against + * VirtualBox upstream's headers, so this is likely not necessary. But + * we must never break our ABI so we keep these around to be 100% sure. + */ +#define VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT _IOC(_IOC_READ | _IOC_WRITE, 'V', 3, 0) +#define VBG_IOCTL_LOG_ALT(s) _IOC(_IOC_READ | _IOC_WRITE, 'V', 9, s) + struct vbg_session; /** VBox guest memory balloon. */ diff --git a/drivers/virt/vboxguest/vboxguest_linux.c b/drivers/virt/vboxguest/vboxguest_linux.c index 6e8c0f1c1056..32c2c52f7e84 100644 --- a/drivers/virt/vboxguest/vboxguest_linux.c +++ b/drivers/virt/vboxguest/vboxguest_linux.c @@ -131,7 +131,8 @@ static long vbg_misc_device_ioctl(struct file *filp, unsigned int req, * the need for a bounce-buffer and another copy later on. */ is_vmmdev_req = (req & ~IOCSIZE_MASK) == VBG_IOCTL_VMMDEV_REQUEST(0) || - req == VBG_IOCTL_VMMDEV_REQUEST_BIG; + req == VBG_IOCTL_VMMDEV_REQUEST_BIG || + req == VBG_IOCTL_VMMDEV_REQUEST_BIG_ALT; if (is_vmmdev_req) buf = vbg_req_alloc(size, VBG_IOCTL_HDR_TYPE_DEFAULT, diff --git a/include/uapi/linux/vboxguest.h b/include/uapi/linux/vboxguest.h index 9cec58a6a5ea..f79d7abe27db 100644 --- a/include/uapi/linux/vboxguest.h +++ b/include/uapi/linux/vboxguest.h @@ -103,7 +103,7 @@ VMMDEV_ASSERT_SIZE(vbg_ioctl_driver_version_info, 24 + 20); /* IOCTL to perform a VMM Device request larger then 1KB. */ -#define VBG_IOCTL_VMMDEV_REQUEST_BIG _IOC(_IOC_READ | _IOC_WRITE, 'V', 3, 0) +#define VBG_IOCTL_VMMDEV_REQUEST_BIG _IO('V', 3) /** VBG_IOCTL_HGCM_CONNECT data structure. */ @@ -198,7 +198,7 @@ struct vbg_ioctl_log { } u; }; -#define VBG_IOCTL_LOG(s) _IOC(_IOC_READ | _IOC_WRITE, 'V', 9, s) +#define VBG_IOCTL_LOG(s) _IO('V', 9) /** VBG_IOCTL_WAIT_FOR_EVENTS data structure. */ -- cgit v1.2.3 From 631beddc5466731b048263a4a9d3d67150e72f8d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 9 Jul 2020 14:08:55 +0200 Subject: virt: vbox: Add support for the new VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES ioctl Add support for the new VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES ioctl, this is necessary for automatic resizing of the guest resolution to match the VM-window size to work with the new VMSVGA virtual GPU which is now the new default in VirtualBox. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1789545 Acked-by: Arnd Bergmann Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200709120858.63928-6-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/virt/vboxguest/vboxguest_core.c | 163 +++++++++++++++++++++++++++++++- drivers/virt/vboxguest/vboxguest_core.h | 14 +++ include/uapi/linux/vboxguest.h | 24 +++++ 3 files changed, 200 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c index 15b3cb618c6e..4f1addaa3f6f 100644 --- a/drivers/virt/vboxguest/vboxguest_core.c +++ b/drivers/virt/vboxguest/vboxguest_core.c @@ -679,7 +679,7 @@ static int vbg_set_host_capabilities(struct vbg_dev *gdev, WARN_ON(!mutex_is_locked(&gdev->session_mutex)); - caps = gdev->set_guest_caps_tracker.mask; + caps = gdev->acquired_guest_caps | gdev->set_guest_caps_tracker.mask; if (gdev->guest_caps_host == caps) return 0; @@ -703,6 +703,113 @@ static int vbg_set_host_capabilities(struct vbg_dev *gdev, return vbg_status_code_to_errno(rc); } +/** + * Acquire (get exclusive access) guest capabilities for a session. + * Takes the session mutex. + * Return: 0 or negative errno value. + * @gdev: The Guest extension device. + * @session: The session. + * @flags: Flags (VBGL_IOC_AGC_FLAGS_XXX). + * @or_mask: The capabilities to add. + * @not_mask: The capabilities to remove. + * @session_termination: Set if we're called by the session cleanup code. + * This tweaks the error handling so we perform + * proper session cleanup even if the host + * misbehaves. + */ +static int vbg_acquire_session_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + u32 or_mask, u32 not_mask, + u32 flags, bool session_termination) +{ + unsigned long irqflags; + bool wakeup = false; + int ret = 0; + + mutex_lock(&gdev->session_mutex); + + if (gdev->set_guest_caps_tracker.mask & or_mask) { + vbg_err("%s error: cannot acquire caps which are currently set\n", + __func__); + ret = -EINVAL; + goto out; + } + + /* + * Mark any caps in the or_mask as now being in acquire-mode. Note + * once caps are in acquire_mode they always stay in this mode. + * This impacts event handling, so we take the event-lock. + */ + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + gdev->acquire_mode_guest_caps |= or_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + /* If we only have to switch the caps to acquire mode, we're done. */ + if (flags & VBGL_IOC_AGC_FLAGS_CONFIG_ACQUIRE_MODE) + goto out; + + not_mask &= ~or_mask; /* or_mask takes priority over not_mask */ + not_mask &= session->acquired_guest_caps; + or_mask &= ~session->acquired_guest_caps; + + if (or_mask == 0 && not_mask == 0) + goto out; + + if (gdev->acquired_guest_caps & or_mask) { + ret = -EBUSY; + goto out; + } + + gdev->acquired_guest_caps |= or_mask; + gdev->acquired_guest_caps &= ~not_mask; + /* session->acquired_guest_caps impacts event handling, take the lock */ + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + session->acquired_guest_caps |= or_mask; + session->acquired_guest_caps &= ~not_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + ret = vbg_set_host_capabilities(gdev, session, session_termination); + /* Roll back on failure, unless it's session termination time. */ + if (ret < 0 && !session_termination) { + gdev->acquired_guest_caps &= ~or_mask; + gdev->acquired_guest_caps |= not_mask; + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + session->acquired_guest_caps &= ~or_mask; + session->acquired_guest_caps |= not_mask; + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + } + + /* + * If we added a capability, check if that means some other thread in + * our session should be unblocked because there are events pending + * (the result of vbg_get_allowed_event_mask_for_session() may change). + * + * HACK ALERT! When the seamless support capability is added we generate + * a seamless change event so that the ring-3 client can sync with + * the seamless state. + */ + if (ret == 0 && or_mask != 0) { + spin_lock_irqsave(&gdev->event_spinlock, irqflags); + + if (or_mask & VMMDEV_GUEST_SUPPORTS_SEAMLESS) + gdev->pending_events |= + VMMDEV_EVENT_SEAMLESS_MODE_CHANGE_REQUEST; + + if (gdev->pending_events) + wakeup = true; + + spin_unlock_irqrestore(&gdev->event_spinlock, irqflags); + + if (wakeup) + wake_up(&gdev->event_wq); + } + +out: + mutex_unlock(&gdev->session_mutex); + + return ret; +} + /** * Sets the guest capabilities for a session. Takes the session spinlock. * Return: 0 or negative errno value. @@ -725,6 +832,13 @@ static int vbg_set_session_capabilities(struct vbg_dev *gdev, mutex_lock(&gdev->session_mutex); + if (gdev->acquire_mode_guest_caps & or_mask) { + vbg_err("%s error: cannot set caps which are in acquire_mode\n", + __func__); + ret = -EBUSY; + goto out; + } + /* Apply the changes to the session mask. */ previous = session->set_guest_caps; session->set_guest_caps |= or_mask; @@ -962,6 +1076,7 @@ void vbg_core_close_session(struct vbg_session *session) struct vbg_dev *gdev = session->gdev; int i, rc; + vbg_acquire_session_capabilities(gdev, session, 0, U32_MAX, 0, true); vbg_set_session_capabilities(gdev, session, 0, U32_MAX, true); vbg_set_session_event_filter(gdev, session, 0, U32_MAX, true); @@ -1019,6 +1134,25 @@ static int vbg_ioctl_driver_version_info( return 0; } +/* Must be called with the event_lock held */ +static u32 vbg_get_allowed_event_mask_for_session(struct vbg_dev *gdev, + struct vbg_session *session) +{ + u32 acquire_mode_caps = gdev->acquire_mode_guest_caps; + u32 session_acquired_caps = session->acquired_guest_caps; + u32 allowed_events = VMMDEV_EVENT_VALID_EVENT_MASK; + + if ((acquire_mode_caps & VMMDEV_GUEST_SUPPORTS_GRAPHICS) && + !(session_acquired_caps & VMMDEV_GUEST_SUPPORTS_GRAPHICS)) + allowed_events &= ~VMMDEV_EVENT_DISPLAY_CHANGE_REQUEST; + + if ((acquire_mode_caps & VMMDEV_GUEST_SUPPORTS_SEAMLESS) && + !(session_acquired_caps & VMMDEV_GUEST_SUPPORTS_SEAMLESS)) + allowed_events &= ~VMMDEV_EVENT_SEAMLESS_MODE_CHANGE_REQUEST; + + return allowed_events; +} + static bool vbg_wait_event_cond(struct vbg_dev *gdev, struct vbg_session *session, u32 event_mask) @@ -1030,6 +1164,7 @@ static bool vbg_wait_event_cond(struct vbg_dev *gdev, spin_lock_irqsave(&gdev->event_spinlock, flags); events = gdev->pending_events & event_mask; + events &= vbg_get_allowed_event_mask_for_session(gdev, session); wakeup = events || session->cancel_waiters; spin_unlock_irqrestore(&gdev->event_spinlock, flags); @@ -1044,6 +1179,7 @@ static u32 vbg_consume_events_locked(struct vbg_dev *gdev, { u32 events = gdev->pending_events & event_mask; + events &= vbg_get_allowed_event_mask_for_session(gdev, session); gdev->pending_events &= ~events; return events; } @@ -1445,6 +1581,29 @@ static int vbg_ioctl_change_filter_mask(struct vbg_dev *gdev, false); } +static int vbg_ioctl_acquire_guest_capabilities(struct vbg_dev *gdev, + struct vbg_session *session, + struct vbg_ioctl_acquire_guest_caps *caps) +{ + u32 flags, or_mask, not_mask; + + if (vbg_ioctl_chk(&caps->hdr, sizeof(caps->u.in), 0)) + return -EINVAL; + + flags = caps->u.in.flags; + or_mask = caps->u.in.or_mask; + not_mask = caps->u.in.not_mask; + + if (flags & ~VBGL_IOC_AGC_FLAGS_VALID_MASK) + return -EINVAL; + + if ((or_mask | not_mask) & ~VMMDEV_GUEST_CAPABILITIES_MASK) + return -EINVAL; + + return vbg_acquire_session_capabilities(gdev, session, or_mask, + not_mask, flags, false); +} + static int vbg_ioctl_change_guest_capabilities(struct vbg_dev *gdev, struct vbg_session *session, struct vbg_ioctl_set_guest_caps *caps) { @@ -1554,6 +1713,8 @@ int vbg_core_ioctl(struct vbg_session *session, unsigned int req, void *data) return vbg_ioctl_interrupt_all_wait_events(gdev, session, data); case VBG_IOCTL_CHANGE_FILTER_MASK: return vbg_ioctl_change_filter_mask(gdev, session, data); + case VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES: + return vbg_ioctl_acquire_guest_capabilities(gdev, session, data); case VBG_IOCTL_CHANGE_GUEST_CAPABILITIES: return vbg_ioctl_change_guest_capabilities(gdev, session, data); case VBG_IOCTL_CHECK_BALLOON: diff --git a/drivers/virt/vboxguest/vboxguest_core.h b/drivers/virt/vboxguest/vboxguest_core.h index dc745a033164..ab4bf64e2cec 100644 --- a/drivers/virt/vboxguest/vboxguest_core.h +++ b/drivers/virt/vboxguest/vboxguest_core.h @@ -117,6 +117,15 @@ struct vbg_dev { */ u32 event_filter_host; + /** + * Guest capabilities which have been switched to acquire_mode. + */ + u32 acquire_mode_guest_caps; + /** + * Guest capabilities acquired by vbg_acquire_session_capabilities(). + * Only one session can acquire a capability at a time. + */ + u32 acquired_guest_caps; /** * Usage counters for guest capabilities requested through * vbg_set_session_capabilities(). Indexed by capability bit @@ -164,6 +173,11 @@ struct vbg_session { * host filter. Protected by vbg_gdev.session_mutex. */ u32 event_filter; + /** + * Guest capabilities acquired by vbg_acquire_session_capabilities(). + * Only one session can acquire a capability at a time. + */ + u32 acquired_guest_caps; /** * Guest capabilities set through vbg_set_session_capabilities(). * A capability claimed by any guest session will be reported to the diff --git a/include/uapi/linux/vboxguest.h b/include/uapi/linux/vboxguest.h index f79d7abe27db..15125f6ec60d 100644 --- a/include/uapi/linux/vboxguest.h +++ b/include/uapi/linux/vboxguest.h @@ -257,6 +257,30 @@ VMMDEV_ASSERT_SIZE(vbg_ioctl_change_filter, 24 + 8); _IOWR('V', 12, struct vbg_ioctl_change_filter) +/** VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES data structure. */ +struct vbg_ioctl_acquire_guest_caps { + /** The header. */ + struct vbg_ioctl_hdr hdr; + union { + struct { + /** Flags (VBGL_IOC_AGC_FLAGS_XXX). */ + __u32 flags; + /** Capabilities to set (VMMDEV_GUEST_SUPPORTS_XXX). */ + __u32 or_mask; + /** Capabilities to drop (VMMDEV_GUEST_SUPPORTS_XXX). */ + __u32 not_mask; + } in; + } u; +}; +VMMDEV_ASSERT_SIZE(vbg_ioctl_acquire_guest_caps, 24 + 12); + +#define VBGL_IOC_AGC_FLAGS_CONFIG_ACQUIRE_MODE 0x00000001 +#define VBGL_IOC_AGC_FLAGS_VALID_MASK 0x00000001 + +#define VBG_IOCTL_ACQUIRE_GUEST_CAPABILITIES \ + _IOWR('V', 13, struct vbg_ioctl_acquire_guest_caps) + + /** VBG_IOCTL_CHANGE_GUEST_CAPABILITIES data structure. */ struct vbg_ioctl_set_guest_caps { /** The header. */ -- cgit v1.2.3 From 316b0035402f05fe9e9e5334d1ff65dae285cb7c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 9 Jul 2020 14:08:56 +0200 Subject: virt: vbox: Add a few new vmmdev request types to the userspace whitelist Upstream VirtualBox has defined and is using a few new request types for vmmdev requests passed through /dev/vboxguest to the hypervisor. Add the defines for these to vbox_vmmdev_types.h and add add them to the whitelists of vmmdev requests which userspace is allowed to make. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1789545 Acked-by: Arnd Bergmann Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200709120858.63928-7-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/virt/vboxguest/vboxguest_core.c | 2 ++ include/uapi/linux/vbox_vmmdev_types.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/drivers/virt/vboxguest/vboxguest_core.c b/drivers/virt/vboxguest/vboxguest_core.c index 4f1addaa3f6f..ffd76b949276 100644 --- a/drivers/virt/vboxguest/vboxguest_core.c +++ b/drivers/virt/vboxguest/vboxguest_core.c @@ -1299,7 +1299,9 @@ static int vbg_req_allowed(struct vbg_dev *gdev, struct vbg_session *session, case VMMDEVREQ_VIDEO_ACCEL_ENABLE: case VMMDEVREQ_VIDEO_ACCEL_FLUSH: case VMMDEVREQ_VIDEO_SET_VISIBLE_REGION: + case VMMDEVREQ_VIDEO_UPDATE_MONITOR_POSITIONS: case VMMDEVREQ_GET_DISPLAY_CHANGE_REQEX: + case VMMDEVREQ_GET_DISPLAY_CHANGE_REQ_MULTI: case VMMDEVREQ_GET_SEAMLESS_CHANGE_REQ: case VMMDEVREQ_GET_VRDPCHANGE_REQ: case VMMDEVREQ_LOG_STRING: diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h index c27289fd619a..f8a8d6b3c521 100644 --- a/include/uapi/linux/vbox_vmmdev_types.h +++ b/include/uapi/linux/vbox_vmmdev_types.h @@ -63,6 +63,7 @@ enum vmmdev_request_type { VMMDEVREQ_SET_GUEST_CAPABILITIES = 56, VMMDEVREQ_VIDEMODE_SUPPORTED2 = 57, /* since version 3.2.0 */ VMMDEVREQ_GET_DISPLAY_CHANGE_REQEX = 80, /* since version 4.2.4 */ + VMMDEVREQ_GET_DISPLAY_CHANGE_REQ_MULTI = 81, VMMDEVREQ_HGCM_CONNECT = 60, VMMDEVREQ_HGCM_DISCONNECT = 61, VMMDEVREQ_HGCM_CALL32 = 62, @@ -92,6 +93,8 @@ enum vmmdev_request_type { VMMDEVREQ_WRITE_COREDUMP = 218, VMMDEVREQ_GUEST_HEARTBEAT = 219, VMMDEVREQ_HEARTBEAT_CONFIGURE = 220, + VMMDEVREQ_NT_BUG_CHECK = 221, + VMMDEVREQ_VIDEO_UPDATE_MONITOR_POSITIONS = 222, /* Ensure the enum is a 32 bit data-type */ VMMDEVREQ_SIZEHACK = 0x7fffffff }; -- cgit v1.2.3 From 04aaca197f16aff608e19ee98b1e55f535d746be Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 17 Jun 2020 17:33:13 +0900 Subject: char: raw: do not leak CONFIG_MAX_RAW_DEVS to userspace include/uapi/linux/raw.h leaks CONFIG_MAX_RAW_DEVS to userspace. Userspace programs cannot use MAX_RAW_MINORS since CONFIG_MAX_RAW_DEVS is not available anyway. Remove the MAX_RAW_MINORS definition from the exported header, and use CONFIG_MAX_RAW_DEVS in drivers/char/raw.c While I was here, I converted printk(KERN_WARNING ...) to pr_warn(...) and stretched the warning message. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20200617083313.183184-1-masahiroy@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/char/raw.c | 8 ++++---- include/uapi/linux/raw.h | 2 -- scripts/headers_install.sh | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 3484e9145aea..380bf518338e 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -37,7 +37,7 @@ static struct raw_device_data *raw_devices; static DEFINE_MUTEX(raw_mutex); static const struct file_operations raw_ctl_fops; /* forward declaration */ -static int max_raw_minors = MAX_RAW_MINORS; +static int max_raw_minors = CONFIG_MAX_RAW_DEVS; module_param(max_raw_minors, int, 0); MODULE_PARM_DESC(max_raw_minors, "Maximum number of raw devices (1-65536)"); @@ -317,9 +317,9 @@ static int __init raw_init(void) int ret; if (max_raw_minors < 1 || max_raw_minors > 65536) { - printk(KERN_WARNING "raw: invalid max_raw_minors (must be" - " between 1 and 65536), using %d\n", MAX_RAW_MINORS); - max_raw_minors = MAX_RAW_MINORS; + pr_warn("raw: invalid max_raw_minors (must be between 1 and 65536), using %d\n", + CONFIG_MAX_RAW_DEVS); + max_raw_minors = CONFIG_MAX_RAW_DEVS; } raw_devices = vzalloc(array_size(max_raw_minors, diff --git a/include/uapi/linux/raw.h b/include/uapi/linux/raw.h index dc96dda479d6..47874919d0b9 100644 --- a/include/uapi/linux/raw.h +++ b/include/uapi/linux/raw.h @@ -14,6 +14,4 @@ struct raw_config_request __u64 block_minor; }; -#define MAX_RAW_MINORS CONFIG_MAX_RAW_DEVS - #endif /* __LINUX_RAW_H */ diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index 224f51012b6e..cdd66038818c 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -90,7 +90,6 @@ include/uapi/linux/elfcore.h:CONFIG_BINFMT_ELF_FDPIC include/uapi/linux/eventpoll.h:CONFIG_PM_SLEEP include/uapi/linux/hw_breakpoint.h:CONFIG_HAVE_MIXED_BREAKPOINTS_REGS include/uapi/linux/pktcdvd.h:CONFIG_CDROM_PKTCDVD_WCACHE -include/uapi/linux/raw.h:CONFIG_MAX_RAW_DEVS " for c in $configs -- cgit v1.2.3 From 7c97f3aded10aa86fc1944341288434117e9c926 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 2 Jul 2020 11:29:31 +0300 Subject: RDMA/counter: Add PID category support in auto mode With the "PID" category QPs have same PID will be bound to same counter; If this category is not set then QPs have different PIDs will be bound to same counter. This is implemented for 2 reasons: 1. The counter is a limited resource, while there may be dozens of applications, each of which creates several types of QPs, which means it may doesn't have enough counter. 2. The system administrator needs all QPs created by all applications with same type bound to one counter. The counter name and PID is only make sense when "PID" category are configured. This category can also be used in combine with others, e.g. QP type. Link: https://lore.kernel.org/r/20200702082933.424537-2-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 20 +++++--------------- drivers/infiniband/core/nldev.c | 8 ++++++-- include/uapi/rdma/rdma_netlink.h | 1 + 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 2257d7f7810f..40204c6caa5b 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -8,7 +8,7 @@ #include "core_priv.h" #include "restrack.h" -#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE) +#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) static int __counter_set_mode(struct rdma_counter_mode *curr, enum rdma_nl_counter_mode new_mode, @@ -149,23 +149,13 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, struct auto_mode_param *param = &counter->mode.param; bool match = true; - /* - * Ensure that counter belongs to the right PID. This operation can - * race with user space which kills the process and leaves QP and - * counters orphans. - * - * It is not a big deal because exitted task will leave both QP and - * counter in the same bucket of zombie process. Just ensure that - * process is still alive before procedding. - * - */ - if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task) || - !task_pid_nr(qp->res.task)) - return false; - if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) match &= (param->qp_type == qp->qp_type); + if (auto_mask & RDMA_COUNTER_MASK_PID) + match &= (task_pid_nr(counter->res.task) == + task_pid_nr(qp->res.task)); + return match; } diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 1051b5622b62..76af7ea2875d 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -711,11 +711,16 @@ static int fill_stat_counter_mode(struct sk_buff *msg, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) return -EMSGSIZE; - if (m->mode == RDMA_COUNTER_MODE_AUTO) + if (m->mode == RDMA_COUNTER_MODE_AUTO) { if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) return -EMSGSIZE; + if ((m->mask & RDMA_COUNTER_MASK_PID) && + fill_res_name_pid(msg, &counter->res)) + return -EMSGSIZE; + } + return 0; } @@ -855,7 +860,6 @@ static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || - fill_res_name_pid(msg, &counter->res) || fill_stat_counter_mode(msg, counter) || fill_stat_counter_qps(msg, counter) || fill_stat_counter_hwcounters(msg, counter)) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 3826143d420d..d2f5b8396243 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -569,5 +569,6 @@ enum rdma_nl_counter_mode { */ enum rdma_nl_counter_mask { RDMA_COUNTER_MASK_QP_TYPE = 1, + RDMA_COUNTER_MASK_PID = 1 << 1, }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From c7d759eb7b12f91a25f4d3cd03ff5209046ddfc2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 Jul 2020 17:42:47 -0700 Subject: ethtool: add tunnel info interface Add an interface to report offloaded UDP ports via ethtool netlink. Now that core takes care of tracking which UDP tunnel ports the NICs are aware of we can quite easily export this information out to user space. The responsibility of writing the netlink dumps is split between ethtool code and udp_tunnel_nic.c - since udp_tunnel module may not always be loaded, yet we should always report the capabilities of the NIC. $ ethtool --show-tunnels eth0 Tunnel information for eth0: UDP port table 0: Size: 4 Types: vxlan No entries UDP port table 1: Size: 4 Types: geneve, vxlan-gpe Entries (1): port 1230, vxlan-gpe v4: - back to v2, build fix is now directly in udp_tunnel.h v3: - don't compile ETHTOOL_MSG_TUNNEL_INFO_GET in if CONFIG_INET not set. v2: - fix string set count, - reorder enums in the uAPI, - fix type of ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES to bitset in docs and comments. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 33 ++++ include/net/udp_tunnel.h | 21 +++ include/uapi/linux/ethtool.h | 2 + include/uapi/linux/ethtool_netlink.h | 55 ++++++ net/ethtool/Makefile | 3 +- net/ethtool/common.c | 9 + net/ethtool/common.h | 1 + net/ethtool/netlink.c | 12 ++ net/ethtool/netlink.h | 4 + net/ethtool/strset.c | 5 + net/ethtool/tunnels.c | 259 +++++++++++++++++++++++++++ net/ipv4/udp_tunnel_nic.c | 69 +++++++ 12 files changed, 472 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/tunnels.c (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 459a0d11cfde..7d75f1e32152 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1230,6 +1230,39 @@ used to report the amplitude of the reflection for a given pair. | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | +-+-+-----------------------------------------+--------+----------------------+ +TUNNEL_INFO +=========== + +Gets information about the tunnel state NIC is aware of. + +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_TUNNEL_INFO_HEADER`` nested request header + ===================================== ====== ========================== + +Kernel response contents: + + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_TUNNEL_INFO_HEADER`` | nested | reply header | + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_TUNNEL_INFO_UDP_PORTS`` | nested | all UDP port tables | + +-+-------------------------------------------+--------+---------------------+ + | | ``ETHTOOL_A_TUNNEL_UDP_TABLE`` | nested | one UDP port table | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE`` | u32 | max size of the | + | | | | | table | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES`` | bitset | tunnel types which | + | | | | | table can hold | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY`` | nested | offloaded UDP port | + +-+-+-+---------------------------------------+--------+---------------------+ + | | | | ``ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT`` | be16 | UDP port | + +-+-+-+---------------------------------------+--------+---------------------+ + | | | | ``ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE`` | u32 | tunnel type | + +-+-+-+---------------------------------------+--------+---------------------+ + Request translation =================== diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index ee34619e4cfa..dd20ce99740c 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -255,6 +255,10 @@ struct udp_tunnel_nic_ops { void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*reset_ntf)(struct net_device *dev); + + size_t (*dump_size)(struct net_device *dev, unsigned int table); + int (*dump_write)(struct net_device *dev, unsigned int table, + struct sk_buff *skb); }; #ifdef CONFIG_INET @@ -318,4 +322,21 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->reset_ntf(dev); } + +static inline size_t +udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + if (!udp_tunnel_nic_ops) + return 0; + return udp_tunnel_nic_ops->dump_size(dev, table); +} + +static inline int +udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + if (!udp_tunnel_nic_ops) + return 0; + return udp_tunnel_nic_ops->dump_write(dev, table, skb); +} #endif diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 60856e0f9618..b4f2d134e713 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -669,6 +669,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags * @ETH_SS_TS_TX_TYPES: timestamping Tx types * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters + * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -686,6 +687,7 @@ enum ethtool_stringset { ETH_SS_SOF_TIMESTAMPING, ETH_SS_TS_TX_TYPES, ETH_SS_TS_RX_FILTERS, + ETH_SS_UDP_TUNNEL_TYPES, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index c12ce4df4b6b..5dcd24cb33ea 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -41,6 +41,7 @@ enum { ETHTOOL_MSG_TSINFO_GET, ETHTOOL_MSG_CABLE_TEST_ACT, ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + ETHTOOL_MSG_TUNNEL_INFO_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -556,6 +557,60 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 }; +/* TUNNEL INFO */ + +enum { + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, + ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, + + __ETHTOOL_UDP_TUNNEL_TYPE_CNT +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ + ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ + ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ + ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, + ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_CNT, + ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_INFO_UNSPEC, + ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_INFO_CNT, + ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 0c2b94f20499..7a849ff22dad 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ + tunnels.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index c54166713797..ed19573fccd7 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -272,6 +273,14 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", + [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE] = "vxlan-gpe", +}; +static_assert(ARRAY_SIZE(udp_tunnel_type_names) == + __ETHTOOL_UDP_TUNNEL_TYPE_CNT); + /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. rest of ksettings always updated */ diff --git a/net/ethtool/common.h b/net/ethtool/common.h index b83bef38368c..3d9251c95a8b 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -28,6 +28,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 88fd07f47040..fb9d096faaa4 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -181,6 +181,12 @@ err: return NULL; } +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) +{ + return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + ðtool_genl_family, 0, cmd); +} + void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, @@ -849,6 +855,12 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, }, + { + .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, + .doit = ethnl_tunnel_info_doit, + .start = ethnl_tunnel_info_start, + .dumpit = ethnl_tunnel_info_dumpit, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9a96b6e90dc2..e2085005caac 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -19,6 +19,7 @@ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); @@ -361,5 +362,8 @@ int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_start(struct netlink_callback *cb); +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 773634b6b048..82707b662fe4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_UDP_TUNNEL_TYPES] = { + .per_dev = false, + .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + .strings = udp_tunnel_type_names, + }, }; struct strset_req_info { diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c new file mode 100644 index 000000000000..6b89255f1231 --- /dev/null +++ b/net/ethtool/tunnels.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include "bitset.h" +#include "common.h" +#include "netlink.h" + +static const struct nla_policy +ethtool_tunnel_info_policy[ETHTOOL_A_TUNNEL_INFO_MAX + 1] = { + [ETHTOOL_A_TUNNEL_INFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_TUNNEL_INFO_HEADER] = { .type = NLA_NESTED }, +}; + +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE == + ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE)); + +static ssize_t +ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base, + struct netlink_ext_ack *extack) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + unsigned int i; + size_t size; + int ret; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) { + NL_SET_ERR_MSG(extack, + "device does not report tunnel offload info"); + return -EOPNOTSUPP; + } + + size = nla_total_size(0); /* _INFO_UDP_PORTS */ + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + return size; + + size += nla_total_size(0); /* _UDP_TABLE */ + size += nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */ + ret = ethnl_bitset32_size(&info->tables[i].tunnel_types, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact); + if (ret < 0) + return ret; + size += ret; + + size += udp_tunnel_nic_dump_size(req_base->dev, i); + } + + return size; +} + +static int +ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base, + struct sk_buff *skb) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + struct nlattr *ports, *table; + unsigned int i; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) + return -EOPNOTSUPP; + + ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS); + if (!ports) + return -EMSGSIZE; + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + break; + + table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE); + if (!table) + goto err_cancel_ports; + + if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, + info->tables[i].n_entries)) + goto err_cancel_table; + + if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + &info->tables[i].tunnel_types, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact)) + goto err_cancel_table; + + if (udp_tunnel_nic_dump_write(req_base->dev, i, skb)) + goto err_cancel_table; + + nla_nest_end(skb, table); + } + + nla_nest_end(skb, ports); + + return 0; + +err_cancel_table: + nla_nest_cancel(skb, table); +err_cancel_ports: + nla_nest_cancel(skb, ports); + return -EMSGSIZE; +} + +static int +ethnl_tunnel_info_req_parse(struct ethnl_req_info *req_info, + const struct nlmsghdr *nlhdr, struct net *net, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr *tb[ETHTOOL_A_TUNNEL_INFO_MAX + 1]; + int ret; + + ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_TUNNEL_INFO_MAX, + ethtool_tunnel_info_policy, extack); + if (ret < 0) + return ret; + + return ethnl_parse_header_dev_get(req_info, + tb[ETHTOOL_A_TUNNEL_INFO_HEADER], + net, extack, require_dev); +} + +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct sk_buff *rskb; + void *reply_payload; + int reply_len; + int ret; + + ret = ethnl_tunnel_info_req_parse(&req_info, info->nlhdr, + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + rtnl_lock(); + ret = ethnl_tunnel_info_reply_size(&req_info, info->extack); + if (ret < 0) + goto err_unlock_rtnl; + reply_len = ret + ethnl_reply_header_size(); + + rskb = ethnl_reply_init(reply_len, req_info.dev, + ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_A_TUNNEL_INFO_HEADER, + info, &reply_payload); + if (!rskb) { + ret = -ENOMEM; + goto err_unlock_rtnl; + } + + ret = ethnl_tunnel_info_fill_reply(&req_info, rskb); + if (ret) + goto err_free_msg; + rtnl_unlock(); + dev_put(req_info.dev); + genlmsg_end(rskb, reply_payload); + + return genlmsg_reply(rskb, info); + +err_free_msg: + nlmsg_free(rskb); +err_unlock_rtnl: + rtnl_unlock(); + dev_put(req_info.dev); + return ret; +} + +struct ethnl_tunnel_info_dump_ctx { + struct ethnl_req_info req_info; + int pos_hash; + int pos_idx; +}; + +int ethnl_tunnel_info_start(struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + memset(ctx, 0, sizeof(*ctx)); + + ret = ethnl_tunnel_info_req_parse(&ctx->req_info, cb->nlh, + sock_net(cb->skb->sk), cb->extack, + false); + if (ctx->req_info.dev) { + dev_put(ctx->req_info.dev); + ctx->req_info.dev = NULL; + } + + return ret; +} + +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + int s_idx = ctx->pos_idx; + int h, idx = 0; + int ret = 0; + void *ehdr; + + rtnl_lock(); + cb->seq = net->dev_base_seq; + for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + struct net_device *dev; + + head = &net->dev_index_head[h]; + idx = 0; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET); + if (!ehdr) { + ret = -EMSGSIZE; + goto out; + } + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + goto out; + } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + goto cont; + goto out; + } + genlmsg_end(skb, ehdr); +cont: + idx++; + } + } +out: + rtnl_unlock(); + + ctx->pos_hash = h; + ctx->pos_idx = idx; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + if (ret == -EMSGSIZE && skb->len) + return skb->len; + return ret; +} diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index 056cfe0b770e..f0dbd9905a53 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. +#include #include #include #include @@ -72,6 +73,12 @@ udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) return entry->use_cnt == 0 && !entry->flags; } +static bool +udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); +} + static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { @@ -564,12 +571,74 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) __udp_tunnel_nic_device_sync(dev, utn); } +static size_t +__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int j; + size_t size; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + size = 0; + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + size += nla_total_size(0) + /* _TABLE_ENTRY */ + nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ + nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ + } + + return size; +} + +static int +__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + struct nlattr *nest; + unsigned int j; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + + if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + utn->entries[table][j].port) || + nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + ilog2(utn->entries[table][j].type))) + goto err_cancel; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, + .dump_size = __udp_tunnel_nic_dump_size, + .dump_write = __udp_tunnel_nic_dump_write, }; static void -- cgit v1.2.3 From 3edd68399dc155b80335244c8c2673eaa652931a Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Fri, 10 Jul 2020 17:48:11 +0200 Subject: KVM: x86: Add a capability for GUEST_MAXPHYADDR < HOST_MAXPHYADDR support This patch adds a new capability KVM_CAP_SMALLER_MAXPHYADDR which allows userspace to query if the underlying architecture would support GUEST_MAXPHYADDR < HOST_MAXPHYADDR and hence act accordingly (e.g. qemu can decide if it should warn for -cpu ..,phys-bits=X) The complications in this patch are due to unexpected (but documented) behaviour we see with NPF vmexit handling in AMD processor. If SVM is modified to add guest physical address checks in the NPF and guest #PF paths, we see the followning error multiple times in the 'access' test in kvm-unit-tests: test pte.p pte.36 pde.p: FAIL: pte 2000021 expected 2000001 Dump mapping: address: 0x123400000000 ------L4: 24c3027 ------L3: 24c4027 ------L2: 24c5021 ------L1: 1002000021 This is because the PTE's accessed bit is set by the CPU hardware before the NPF vmexit. This is handled completely by hardware and cannot be fixed in software. Therefore, availability of the new capability depends on a boolean variable allow_smaller_maxphyaddr which is set individually by VMX and SVM init routines. On VMX it's always set to true, on SVM it's only set to true when NPT is not enabled. CC: Tom Lendacky CC: Babu Moger Signed-off-by: Mohammed Gamal Message-Id: <20200710154811.418214-10-mgamal@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 15 +++++++++++++++ arch/x86/kvm/vmx/vmx.c | 7 +++++++ arch/x86/kvm/x86.c | 6 ++++++ include/uapi/linux/kvm.h | 2 ++ 5 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1df95f10c903..1bab87a444d7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1263,7 +1263,7 @@ struct kvm_arch_async_pf { }; extern u64 __read_mostly host_efer; - +extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; #define __KVM_HAVE_ARCH_VM_ALLOC diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2371b1e40f39..783330d0e7b8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -924,6 +924,21 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + return 0; err: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 962a78c7dde5..1bb59ae5016d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8309,6 +8309,13 @@ static int __init vmx_init(void) #endif vmx_check_vmcs12_offsets(); + /* + * Intel processors don't have problems with + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable + * it for VMX by default + */ + allow_smaller_maxphyaddr = true; + return 0; } module_init(vmx_init); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35abe69aad28..95ef62922869 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -187,6 +187,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs; u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); +bool __read_mostly allow_smaller_maxphyaddr; +EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); + static u64 __read_mostly host_xss; u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); @@ -3574,6 +3577,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; + case KVM_CAP_SMALLER_MAXPHYADDR: + r = (int) allow_smaller_maxphyaddr; + break; default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ff9b335620d0..2c73dcfb3dbb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1033,6 +1033,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 #define KVM_CAP_LAST_CPU 184 +#define KVM_CAP_SMALLER_MAXPHYADDR 185 + #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 47e33c05f9f07cac3de833e531bcac9ae052c7ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2020 15:42:46 -0700 Subject: seccomp: Fix ioctl number for SECCOMP_IOCTL_NOTIF_ID_VALID When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced it had the wrong direction flag set. While this isn't a big deal as nothing currently enforces these bits in the kernel, it should be defined correctly. Fix the define and provide support for the old command until it is no longer needed for backward compatibility. Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 3 ++- kernel/seccomp.c | 9 +++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index c1735455bc53..965290f7dcc2 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -123,5 +123,6 @@ struct seccomp_notif_resp { #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) -#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) + #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0e3f3a7a5d..0ed57e8c49d0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,14 @@ #include #include +/* + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the + * wrong direction flag in the ioctl number. This is the broken one, + * which the kernel needs to keep supporting until all userspaces stop + * using the wrong command number. + */ +#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) + enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, @@ -1236,6 +1244,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); default: diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 43884b6625fc..61f9ac200001 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -186,7 +186,7 @@ struct seccomp_metadata { #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) -#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) struct seccomp_notif { __u64 id; -- cgit v1.2.3 From 21249616f02dc3f4c5efc3b3e9ba48e428b0131c Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Wed, 8 Jul 2020 12:15:57 +0800 Subject: gpio: uapi: fix misplaced comment line The second line of the description for event_type is before the first. Move it to after the first line. Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- include/uapi/linux/gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 0206383c0383..9c27cecf406f 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -71,8 +71,8 @@ enum { * of a GPIO line * @info: updated line information * @timestamp: estimate of time of status change occurrence, in nanoseconds - * and GPIOLINE_CHANGED_CONFIG * @event_type: one of GPIOLINE_CHANGED_REQUESTED, GPIOLINE_CHANGED_RELEASED + * and GPIOLINE_CHANGED_CONFIG * * Note: struct gpioline_info embedded here has 32-bit alignment on its own, * but it works fine with 64-bit alignment too. With its 72 byte size, we can -- cgit v1.2.3 From c1326210477ecc06c53221f0005c64419aba30d6 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:20 +0000 Subject: nfs,nfsd: NFSv4.2 extended attribute protocol definitions Add definitions for the new operations, errors and flags as defined in RFC 8276 (File System Extended Attributes in NFSv4). Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 20 ++++++++++++++++++++ include/uapi/linux/nfs4.h | 3 +++ 2 files changed, 23 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 4dba3c948932..e6ca9d1d2e76 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -150,6 +150,12 @@ enum nfs_opnum4 { OP_WRITE_SAME = 70, OP_CLONE = 71, + /* xattr support (RFC8726) */ + OP_GETXATTR = 72, + OP_SETXATTR = 73, + OP_LISTXATTRS = 74, + OP_REMOVEXATTR = 75, + OP_ILLEGAL = 10044, }; @@ -280,6 +286,10 @@ enum nfsstat4 { NFS4ERR_WRONG_LFS = 10092, NFS4ERR_BADLABEL = 10093, NFS4ERR_OFFLOAD_NO_REQS = 10094, + + /* xattr (RFC8276) */ + NFS4ERR_NOXATTR = 10095, + NFS4ERR_XATTR2BIG = 10096, }; static inline bool seqid_mutating_err(u32 err) @@ -452,6 +462,7 @@ enum change_attr_type4 { #define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) +#define FATTR4_WORD2_XATTR_SUPPORT (1UL << 18) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) @@ -700,4 +711,13 @@ struct nl4_server { struct nfs42_netaddr nl4_addr; /* NL4_NETADDR */ } u; }; + +/* + * Options for setxattr. These match the flags for setxattr(2). + */ +enum nfs4_setxattr_options { + SETXATTR4_EITHER = 0, + SETXATTR4_CREATE = 1, + SETXATTR4_REPLACE = 2, +}; #endif diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 8572930cf5b0..bf197e99b98f 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -33,6 +33,9 @@ #define NFS4_ACCESS_EXTEND 0x0008 #define NFS4_ACCESS_DELETE 0x0010 #define NFS4_ACCESS_EXECUTE 0x0020 +#define NFS4_ACCESS_XAREAD 0x0040 +#define NFS4_ACCESS_XAWRITE 0x0080 +#define NFS4_ACCESS_XALIST 0x0100 #define NFS4_FH_PERSISTENT 0x0000 #define NFS4_FH_NOEXPIRE_WITH_OPEN 0x0001 -- cgit v1.2.3 From 95ad37f90c338e3fd4abf61cecfe02b6f3e080f0 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:04 +0000 Subject: NFSv4.2: add client side xattr caching. Implement client side caching for NFSv4.2 extended attributes. The cache is a per-inode hashtable, with name/value entries. There is one special entry for the listxattr cache. NFS inodes have a pointer to a cache structure. The cache structure is allocated on demand, freed when the cache is invalidated. Memory shrinkers keep the size in check. Large entries (> PAGE_SIZE) are collected by a separate shrinker, and freed more aggressively than others. Signed-off-by: Frank van der Linden Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 2 +- fs/nfs/inode.c | 9 +- fs/nfs/nfs42proc.c | 12 + fs/nfs/nfs42xattr.c | 1083 +++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4_fs.h | 22 + fs/nfs/nfs4proc.c | 42 +- fs/nfs/nfs4super.c | 10 + include/linux/nfs_fs.h | 6 + include/uapi/linux/nfs_fs.h | 1 + 9 files changed, 1179 insertions(+), 8 deletions(-) create mode 100644 fs/nfs/nfs42xattr.c (limited to 'include/uapi') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 2433c3e03cfa..22d11fdc6deb 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -30,7 +30,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o -nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o +nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o nfs42xattr.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 629af798dfc9..10048eed485d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -193,6 +193,7 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) return nfs_check_cache_invalid_not_delegated(inode, flags); } +EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { @@ -234,11 +235,13 @@ static void nfs_zap_caches_locked(struct inode *inode) | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR | NFS_INO_REVAL_PAGECACHE); } else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR | NFS_INO_REVAL_PAGECACHE); nfs_zap_label_cache_locked(nfsi); } @@ -1897,7 +1900,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (!(have_writers || have_delegation)) { invalid |= NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR; /* Force revalidate of all attributes */ save_cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME @@ -2100,6 +2104,9 @@ struct inode *nfs_alloc_inode(struct super_block *sb) #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ +#ifdef CONFIG_NFS_V4_2 + nfsi->xattr_cache = NULL; +#endif return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 8c2e52bc986a..e200522469af 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1182,6 +1182,18 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, if (ret < 0) return ret; + /* + * Normally, the caching is done one layer up, but for successful + * RPCS, always cache the result here, even if the caller was + * just querying the length, or if the reply was too big for + * the caller. This avoids a second RPC in the case of the + * common query-alloc-retrieve cycle for xattrs. + * + * Note that xattr_len is always capped to XATTR_SIZE_MAX. + */ + + nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); + if (buflen) { if (res.xattr_len > buflen) return -ERANGE; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c new file mode 100644 index 000000000000..23fdab977a2a --- /dev/null +++ b/fs/nfs/nfs42xattr.c @@ -0,0 +1,1083 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * User extended attribute client side cache functions. + * + * Author: Frank van der Linden + */ +#include +#include +#include +#include +#include + +#include "nfs4_fs.h" +#include "internal.h" + +/* + * User extended attributes client side caching is implemented by having + * a cache structure attached to NFS inodes. This structure is allocated + * when needed, and freed when the cache is zapped. + * + * The cache structure contains as hash table of entries, and a pointer + * to a special-cased entry for the listxattr cache. + * + * Accessing and allocating / freeing the caches is done via reference + * counting. The cache entries use a similar refcounting scheme. + * + * This makes freeing a cache, both from the shrinker and from the + * zap cache path, easy. It also means that, in current use cases, + * the large majority of inodes will not waste any memory, as they + * will never have any user extended attributes assigned to them. + * + * Attribute entries are hashed in to a simple hash table. They are + * also part of an LRU. + * + * There are three shrinkers. + * + * Two shrinkers deal with the cache entries themselves: one for + * large entries (> PAGE_SIZE), and one for smaller entries. The + * shrinker for the larger entries works more aggressively than + * those for the smaller entries. + * + * The other shrinker frees the cache structures themselves. + */ + +/* + * 64 buckets is a good default. There is likely no reasonable + * workload that uses more than even 64 user extended attributes. + * You can certainly add a lot more - but you get what you ask for + * in those circumstances. + */ +#define NFS4_XATTR_HASH_SIZE 64 + +#define NFSDBG_FACILITY NFSDBG_XATTRCACHE + +struct nfs4_xattr_cache; +struct nfs4_xattr_entry; + +struct nfs4_xattr_bucket { + spinlock_t lock; + struct hlist_head hlist; + struct nfs4_xattr_cache *cache; + bool draining; +}; + +struct nfs4_xattr_cache { + struct kref ref; + spinlock_t hash_lock; /* protects hashtable and lru */ + struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE]; + struct list_head lru; + struct list_head dispose; + atomic_long_t nent; + spinlock_t listxattr_lock; + struct inode *inode; + struct nfs4_xattr_entry *listxattr; + struct work_struct work; +}; + +struct nfs4_xattr_entry { + struct kref ref; + struct hlist_node hnode; + struct list_head lru; + struct list_head dispose; + char *xattr_name; + void *xattr_value; + size_t xattr_size; + struct nfs4_xattr_bucket *bucket; + uint32_t flags; +}; + +#define NFS4_XATTR_ENTRY_EXTVAL 0x0001 + +/* + * LRU list of NFS inodes that have xattr caches. + */ +static struct list_lru nfs4_xattr_cache_lru; +static struct list_lru nfs4_xattr_entry_lru; +static struct list_lru nfs4_xattr_large_entry_lru; + +static struct kmem_cache *nfs4_xattr_cache_cachep; + +static struct workqueue_struct *nfs4_xattr_cache_wq; + +/* + * Hashing helper functions. + */ +static void +nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache) +{ + unsigned int i; + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&cache->buckets[i].hlist); + spin_lock_init(&cache->buckets[i].lock); + cache->buckets[i].cache = cache; + cache->buckets[i].draining = false; + } +} + +/* + * Locking order: + * 1. inode i_lock or bucket lock + * 2. list_lru lock (taken by list_lru_* functions) + */ + +/* + * Wrapper functions to add a cache entry to the right LRU. + */ +static bool +nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) +{ + struct list_lru *lru; + + lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + return list_lru_add(lru, &entry->lru); +} + +static bool +nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) +{ + struct list_lru *lru; + + lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + return list_lru_del(lru, &entry->lru); +} + +/* + * This function allocates cache entries. They are the normal + * extended attribute name/value pairs, but may also be a listxattr + * cache. Those allocations use the same entry so that they can be + * treated as one by the memory shrinker. + * + * xattr cache entries are allocated together with names. If the + * value fits in to one page with the entry structure and the name, + * it will also be part of the same allocation (kmalloc). This is + * expected to be the vast majority of cases. Larger allocations + * have a value pointer that is allocated separately by kvmalloc. + * + * Parameters: + * + * @name: Name of the extended attribute. NULL for listxattr cache + * entry. + * @value: Value of attribute, or listxattr cache. NULL if the + * value is to be copied from pages instead. + * @pages: Pages to copy the value from, if not NULL. Passed in to + * make it easier to copy the value after an RPC, even if + * the value will not be passed up to application (e.g. + * for a 'query' getxattr with NULL buffer). + * @len: Length of the value. Can be 0 for zero-length attribues. + * @value and @pages will be NULL if @len is 0. + */ +static struct nfs4_xattr_entry * +nfs4_xattr_alloc_entry(const char *name, const void *value, + struct page **pages, size_t len) +{ + struct nfs4_xattr_entry *entry; + void *valp; + char *namep; + size_t alloclen, slen; + char *buf; + uint32_t flags; + + BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) + + XATTR_NAME_MAX + 1 > PAGE_SIZE); + + alloclen = sizeof(struct nfs4_xattr_entry); + if (name != NULL) { + slen = strlen(name) + 1; + alloclen += slen; + } else + slen = 0; + + if (alloclen + len <= PAGE_SIZE) { + alloclen += len; + flags = 0; + } else { + flags = NFS4_XATTR_ENTRY_EXTVAL; + } + + buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (buf == NULL) + return NULL; + entry = (struct nfs4_xattr_entry *)buf; + + if (name != NULL) { + namep = buf + sizeof(struct nfs4_xattr_entry); + memcpy(namep, name, slen); + } else { + namep = NULL; + } + + + if (flags & NFS4_XATTR_ENTRY_EXTVAL) { + valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (valp == NULL) { + kfree(buf); + return NULL; + } + } else if (len != 0) { + valp = buf + sizeof(struct nfs4_xattr_entry) + slen; + } else + valp = NULL; + + if (valp != NULL) { + if (value != NULL) + memcpy(valp, value, len); + else + _copy_from_pages(valp, pages, 0, len); + } + + entry->flags = flags; + entry->xattr_value = valp; + kref_init(&entry->ref); + entry->xattr_name = namep; + entry->xattr_size = len; + entry->bucket = NULL; + INIT_LIST_HEAD(&entry->lru); + INIT_LIST_HEAD(&entry->dispose); + INIT_HLIST_NODE(&entry->hnode); + + return entry; +} + +static void +nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry) +{ + if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) + kvfree(entry->xattr_value); + kfree(entry); +} + +static void +nfs4_xattr_free_entry_cb(struct kref *kref) +{ + struct nfs4_xattr_entry *entry; + + entry = container_of(kref, struct nfs4_xattr_entry, ref); + + if (WARN_ON(!list_empty(&entry->lru))) + return; + + nfs4_xattr_free_entry(entry); +} + +static void +nfs4_xattr_free_cache_cb(struct kref *kref) +{ + struct nfs4_xattr_cache *cache; + int i; + + cache = container_of(kref, struct nfs4_xattr_cache, ref); + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist))) + return; + cache->buckets[i].draining = false; + } + + cache->listxattr = NULL; + + kmem_cache_free(nfs4_xattr_cache_cachep, cache); + +} + +static struct nfs4_xattr_cache * +nfs4_xattr_alloc_cache(void) +{ + struct nfs4_xattr_cache *cache; + + cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, + GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (cache == NULL) + return NULL; + + kref_init(&cache->ref); + atomic_long_set(&cache->nent, 0); + + return cache; +} + +/* + * Set the listxattr cache, which is a special-cased cache entry. + * The special value ERR_PTR(-ESTALE) is used to indicate that + * the cache is being drained - this prevents a new listxattr + * cache from being added to what is now a stale cache. + */ +static int +nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache, + struct nfs4_xattr_entry *new) +{ + struct nfs4_xattr_entry *old; + int ret = 1; + + spin_lock(&cache->listxattr_lock); + + old = cache->listxattr; + + if (old == ERR_PTR(-ESTALE)) { + ret = 0; + goto out; + } + + cache->listxattr = new; + if (new != NULL && new != ERR_PTR(-ESTALE)) + nfs4_xattr_entry_lru_add(new); + + if (old != NULL) { + nfs4_xattr_entry_lru_del(old); + kref_put(&old->ref, nfs4_xattr_free_entry_cb); + } +out: + spin_unlock(&cache->listxattr_lock); + + return ret; +} + +/* + * Unlink a cache from its parent inode, clearing out an invalid + * cache. Must be called with i_lock held. + */ +static struct nfs4_xattr_cache * +nfs4_xattr_cache_unlink(struct inode *inode) +{ + struct nfs_inode *nfsi; + struct nfs4_xattr_cache *oldcache; + + nfsi = NFS_I(inode); + + oldcache = nfsi->xattr_cache; + if (oldcache != NULL) { + list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru); + oldcache->inode = NULL; + } + nfsi->xattr_cache = NULL; + nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR; + + return oldcache; + +} + +/* + * Discard a cache. Usually called by a worker, since walking all + * the entries can take up some cycles that we don't want to waste + * in the I/O path. Can also be called from the shrinker callback. + * + * The cache is dead, it has already been unlinked from its inode, + * and no longer appears on the cache LRU list. + * + * Mark all buckets as draining, so that no new entries are added. This + * could still happen in the unlikely, but possible case that another + * thread had grabbed a reference before it was unlinked from the inode, + * and is still holding it for an add operation. + * + * Remove all entries from the LRU lists, so that there is no longer + * any way to 'find' this cache. Then, remove the entries from the hash + * table. + * + * At that point, the cache will remain empty and can be freed when the final + * reference drops, which is very likely the kref_put at the end of + * this function, or the one called immediately afterwards in the + * shrinker callback. + */ +static void +nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache) +{ + unsigned int i; + struct nfs4_xattr_entry *entry; + struct nfs4_xattr_bucket *bucket; + struct hlist_node *n; + + nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE)); + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + bucket = &cache->buckets[i]; + + spin_lock(&bucket->lock); + bucket->draining = true; + hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) { + nfs4_xattr_entry_lru_del(entry); + hlist_del_init(&entry->hnode); + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } + spin_unlock(&bucket->lock); + } + + atomic_long_set(&cache->nent, 0); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +static void +nfs4_xattr_discard_cache_worker(struct work_struct *work) +{ + struct nfs4_xattr_cache *cache = container_of(work, + struct nfs4_xattr_cache, work); + + nfs4_xattr_discard_cache(cache); +} + +static void +nfs4_xattr_reap_cache(struct nfs4_xattr_cache *cache) +{ + queue_work(nfs4_xattr_cache_wq, &cache->work); +} + +/* + * Get a referenced copy of the cache structure. Avoid doing allocs + * while holding i_lock. Which means that we do some optimistic allocation, + * and might have to free the result in rare cases. + * + * This function only checks the NFS_INO_INVALID_XATTR cache validity bit + * and acts accordingly, replacing the cache when needed. For the read case + * (!add), this means that the caller must make sure that the cache + * is valid before caling this function. getxattr and listxattr call + * revalidate_inode to do this. The attribute cache timeout (for the + * non-delegated case) is expected to be dealt with in the revalidate + * call. + */ + +static struct nfs4_xattr_cache * +nfs4_xattr_get_cache(struct inode *inode, int add) +{ + struct nfs_inode *nfsi; + struct nfs4_xattr_cache *cache, *oldcache, *newcache; + + nfsi = NFS_I(inode); + + cache = oldcache = NULL; + + spin_lock(&inode->i_lock); + + if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) + oldcache = nfs4_xattr_cache_unlink(inode); + else + cache = nfsi->xattr_cache; + + if (cache != NULL) + kref_get(&cache->ref); + + spin_unlock(&inode->i_lock); + + if (add && cache == NULL) { + newcache = NULL; + + cache = nfs4_xattr_alloc_cache(); + if (cache == NULL) + goto out; + + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) { + /* + * The cache was invalidated again. Give up, + * since what we want to enter is now likely + * outdated anyway. + */ + spin_unlock(&inode->i_lock); + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + cache = NULL; + goto out; + } + + /* + * Check if someone beat us to it. + */ + if (nfsi->xattr_cache != NULL) { + newcache = nfsi->xattr_cache; + kref_get(&newcache->ref); + } else { + kref_get(&cache->ref); + nfsi->xattr_cache = cache; + cache->inode = inode; + list_lru_add(&nfs4_xattr_cache_lru, &cache->lru); + } + + spin_unlock(&inode->i_lock); + + /* + * If there was a race, throw away the cache we just + * allocated, and use the new one allocated by someone + * else. + */ + if (newcache != NULL) { + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + cache = newcache; + } + } + +out: + /* + * Discarding an old cache is done via a workqueue. + */ + if (oldcache != NULL) + nfs4_xattr_reap_cache(oldcache); + + return cache; +} + +static inline struct nfs4_xattr_bucket * +nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name) +{ + return &cache->buckets[jhash(name, strlen(name), 0) & + (ARRAY_SIZE(cache->buckets) - 1)]; +} + +static struct nfs4_xattr_entry * +nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name) +{ + struct nfs4_xattr_entry *entry; + + entry = NULL; + + hlist_for_each_entry(entry, &bucket->hlist, hnode) { + if (!strcmp(entry->xattr_name, name)) + break; + } + + return entry; +} + +static int +nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache, + struct nfs4_xattr_entry *entry) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *oldentry = NULL; + int ret = 1; + + bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name); + entry->bucket = bucket; + + spin_lock(&bucket->lock); + + if (bucket->draining) { + ret = 0; + goto out; + } + + oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name); + if (oldentry != NULL) { + hlist_del_init(&oldentry->hnode); + nfs4_xattr_entry_lru_del(oldentry); + } else { + atomic_long_inc(&cache->nent); + } + + hlist_add_head(&entry->hnode, &bucket->hlist); + nfs4_xattr_entry_lru_add(entry); + +out: + spin_unlock(&bucket->lock); + + if (oldentry != NULL) + kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb); + + return ret; +} + +static void +nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *entry; + + bucket = nfs4_xattr_hash_bucket(cache, name); + + spin_lock(&bucket->lock); + + entry = nfs4_xattr_get_entry(bucket, name); + if (entry != NULL) { + hlist_del_init(&entry->hnode); + nfs4_xattr_entry_lru_del(entry); + atomic_long_dec(&cache->nent); + } + + spin_unlock(&bucket->lock); + + if (entry != NULL) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); +} + +static struct nfs4_xattr_entry * +nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *entry; + + bucket = nfs4_xattr_hash_bucket(cache, name); + + spin_lock(&bucket->lock); + + entry = nfs4_xattr_get_entry(bucket, name); + if (entry != NULL) + kref_get(&entry->ref); + + spin_unlock(&bucket->lock); + + return entry; +} + +/* + * Entry point to retrieve an entry from the cache. + */ +ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf, + ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + ssize_t ret; + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return -ENOENT; + + ret = 0; + entry = nfs4_xattr_hash_find(cache, name); + + if (entry != NULL) { + dprintk("%s: cache hit '%s', len %lu\n", __func__, + entry->xattr_name, (unsigned long)entry->xattr_size); + if (buflen == 0) { + /* Length probe only */ + ret = entry->xattr_size; + } else if (buflen < entry->xattr_size) + ret = -ERANGE; + else { + memcpy(buf, entry->xattr_value, entry->xattr_size); + ret = entry->xattr_size; + } + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } else { + dprintk("%s: cache miss '%s'\n", __func__, name); + ret = -ENOENT; + } + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + + return ret; +} + +/* + * Retrieve a cached list of xattrs from the cache. + */ +ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + ssize_t ret; + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return -ENOENT; + + spin_lock(&cache->listxattr_lock); + + entry = cache->listxattr; + + if (entry != NULL && entry != ERR_PTR(-ESTALE)) { + if (buflen == 0) { + /* Length probe only */ + ret = entry->xattr_size; + } else if (entry->xattr_size > buflen) + ret = -ERANGE; + else { + memcpy(buf, entry->xattr_value, entry->xattr_size); + ret = entry->xattr_size; + } + } else { + ret = -ENOENT; + } + + spin_unlock(&cache->listxattr_lock); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + + return ret; +} + +/* + * Add an xattr to the cache. + * + * This also invalidates the xattr list cache. + */ +void nfs4_xattr_cache_add(struct inode *inode, const char *name, + const char *buf, struct page **pages, ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + + dprintk("%s: add '%s' len %lu\n", __func__, + name, (unsigned long)buflen); + + cache = nfs4_xattr_get_cache(inode, 1); + if (cache == NULL) + return; + + entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen); + if (entry == NULL) + goto out; + + (void)nfs4_xattr_set_listcache(cache, NULL); + + if (!nfs4_xattr_hash_add(cache, entry)) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + +out: + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + + +/* + * Remove an xattr from the cache. + * + * This also invalidates the xattr list cache. + */ +void nfs4_xattr_cache_remove(struct inode *inode, const char *name) +{ + struct nfs4_xattr_cache *cache; + + dprintk("%s: remove '%s'\n", __func__, name); + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return; + + (void)nfs4_xattr_set_listcache(cache, NULL); + nfs4_xattr_hash_remove(cache, name); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +/* + * Cache listxattr output, replacing any possible old one. + */ +void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, + ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + + cache = nfs4_xattr_get_cache(inode, 1); + if (cache == NULL) + return; + + entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen); + if (entry == NULL) + goto out; + + /* + * This is just there to be able to get to bucket->cache, + * which is obviously the same for all buckets, so just + * use bucket 0. + */ + entry->bucket = &cache->buckets[0]; + + if (!nfs4_xattr_set_listcache(cache, entry)) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + +out: + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +/* + * Zap the entire cache. Called when an inode is evicted. + */ +void nfs4_xattr_cache_zap(struct inode *inode) +{ + struct nfs4_xattr_cache *oldcache; + + spin_lock(&inode->i_lock); + oldcache = nfs4_xattr_cache_unlink(inode); + spin_unlock(&inode->i_lock); + + if (oldcache) + nfs4_xattr_discard_cache(oldcache); +} + +/* + * The entry LRU is shrunk more aggressively than the cache LRU, + * by settings @seeks to 1. + * + * Cache structures are freed only when they've become empty, after + * pruning all but one entry. + */ + +static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, + struct shrink_control *sc); + +static struct shrinker nfs4_xattr_cache_shrinker = { + .count_objects = nfs4_xattr_cache_count, + .scan_objects = nfs4_xattr_cache_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static struct shrinker nfs4_xattr_entry_shrinker = { + .count_objects = nfs4_xattr_entry_count, + .scan_objects = nfs4_xattr_entry_scan, + .seeks = DEFAULT_SEEKS, + .batch = 512, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static struct shrinker nfs4_xattr_large_entry_shrinker = { + .count_objects = nfs4_xattr_entry_count, + .scan_objects = nfs4_xattr_entry_scan, + .seeks = 1, + .batch = 512, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static enum lru_status +cache_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *dispose = arg; + struct inode *inode; + struct nfs4_xattr_cache *cache = container_of(item, + struct nfs4_xattr_cache, lru); + + if (atomic_long_read(&cache->nent) > 1) + return LRU_SKIP; + + /* + * If a cache structure is on the LRU list, we know that + * its inode is valid. Try to lock it to break the link. + * Since we're inverting the lock order here, only try. + */ + inode = cache->inode; + + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; + + kref_get(&cache->ref); + + cache->inode = NULL; + NFS_I(inode)->xattr_cache = NULL; + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR; + list_lru_isolate(lru, &cache->lru); + + spin_unlock(&inode->i_lock); + + list_add_tail(&cache->dispose, dispose); + return LRU_REMOVED; +} + +static unsigned long +nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + struct nfs4_xattr_cache *cache; + + freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc, + cache_lru_isolate, &dispose); + while (!list_empty(&dispose)) { + cache = list_first_entry(&dispose, struct nfs4_xattr_cache, + dispose); + list_del_init(&cache->dispose); + nfs4_xattr_discard_cache(cache); + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + } + + return freed; +} + + +static unsigned long +nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + + count = list_lru_count(&nfs4_xattr_cache_lru); + return vfs_pressure_ratio(count); +} + +static enum lru_status +entry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *dispose = arg; + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry = container_of(item, + struct nfs4_xattr_entry, lru); + + bucket = entry->bucket; + cache = bucket->cache; + + /* + * Unhook the entry from its parent (either a cache bucket + * or a cache structure if it's a listxattr buf), so that + * it's no longer found. Then add it to the isolate list, + * to be freed later. + * + * In both cases, we're reverting lock order, so use + * trylock and skip the entry if we can't get the lock. + */ + if (entry->xattr_name != NULL) { + /* Regular cache entry */ + if (!spin_trylock(&bucket->lock)) + return LRU_SKIP; + + kref_get(&entry->ref); + + hlist_del_init(&entry->hnode); + atomic_long_dec(&cache->nent); + list_lru_isolate(lru, &entry->lru); + + spin_unlock(&bucket->lock); + } else { + /* Listxattr cache entry */ + if (!spin_trylock(&cache->listxattr_lock)) + return LRU_SKIP; + + kref_get(&entry->ref); + + cache->listxattr = NULL; + list_lru_isolate(lru, &entry->lru); + + spin_unlock(&cache->listxattr_lock); + } + + list_add_tail(&entry->dispose, dispose); + return LRU_REMOVED; +} + +static unsigned long +nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + struct nfs4_xattr_entry *entry; + struct list_lru *lru; + + lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); + + while (!list_empty(&dispose)) { + entry = list_first_entry(&dispose, struct nfs4_xattr_entry, + dispose); + list_del_init(&entry->dispose); + + /* + * Drop two references: the one that we just grabbed + * in entry_lru_isolate, and the one that was set + * when the entry was first allocated. + */ + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } + + return freed; +} + +static unsigned long +nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + struct list_lru *lru; + + lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + count = list_lru_count(lru); + return vfs_pressure_ratio(count); +} + + +static void nfs4_xattr_cache_init_once(void *p) +{ + struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p; + + spin_lock_init(&cache->listxattr_lock); + atomic_long_set(&cache->nent, 0); + nfs4_xattr_hash_init(cache); + cache->listxattr = NULL; + INIT_WORK(&cache->work, nfs4_xattr_discard_cache_worker); + INIT_LIST_HEAD(&cache->lru); + INIT_LIST_HEAD(&cache->dispose); +} + +int __init nfs4_xattr_cache_init(void) +{ + int ret = 0; + + nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache", + sizeof(struct nfs4_xattr_cache), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT), + nfs4_xattr_cache_init_once); + if (nfs4_xattr_cache_cachep == NULL) + return -ENOMEM; + + ret = list_lru_init_memcg(&nfs4_xattr_large_entry_lru, + &nfs4_xattr_large_entry_shrinker); + if (ret) + goto out4; + + ret = list_lru_init_memcg(&nfs4_xattr_entry_lru, + &nfs4_xattr_entry_shrinker); + if (ret) + goto out3; + + ret = list_lru_init_memcg(&nfs4_xattr_cache_lru, + &nfs4_xattr_cache_shrinker); + if (ret) + goto out2; + + nfs4_xattr_cache_wq = alloc_workqueue("nfs4_xattr", WQ_MEM_RECLAIM, 0); + if (nfs4_xattr_cache_wq == NULL) + goto out1; + + ret = register_shrinker(&nfs4_xattr_cache_shrinker); + if (ret) + goto out0; + + ret = register_shrinker(&nfs4_xattr_entry_shrinker); + if (ret) + goto out; + + ret = register_shrinker(&nfs4_xattr_large_entry_shrinker); + if (!ret) + return 0; + + unregister_shrinker(&nfs4_xattr_entry_shrinker); +out: + unregister_shrinker(&nfs4_xattr_cache_shrinker); +out0: + destroy_workqueue(nfs4_xattr_cache_wq); +out1: + list_lru_destroy(&nfs4_xattr_cache_lru); +out2: + list_lru_destroy(&nfs4_xattr_entry_lru); +out3: + list_lru_destroy(&nfs4_xattr_large_entry_lru); +out4: + kmem_cache_destroy(nfs4_xattr_cache_cachep); + + return ret; +} + +void nfs4_xattr_cache_exit(void) +{ + unregister_shrinker(&nfs4_xattr_entry_shrinker); + unregister_shrinker(&nfs4_xattr_cache_shrinker); + list_lru_destroy(&nfs4_xattr_entry_lru); + list_lru_destroy(&nfs4_xattr_cache_lru); + kmem_cache_destroy(nfs4_xattr_cache_cachep); + destroy_workqueue(nfs4_xattr_cache_wq); +} diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 2fa9e4ea98d2..7e16a586f3fc 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -626,12 +626,34 @@ static inline bool nfs4_state_match_open_stateid_other(const struct nfs4_state * nfs4_stateid_match_other(&state->open_stateid, stateid); } +/* nfs42xattr.c */ +#ifdef CONFIG_NFS_V4_2 +extern int __init nfs4_xattr_cache_init(void); +extern void nfs4_xattr_cache_exit(void); +extern void nfs4_xattr_cache_add(struct inode *inode, const char *name, + const char *buf, struct page **pages, + ssize_t buflen); +extern void nfs4_xattr_cache_remove(struct inode *inode, const char *name); +extern ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, + char *buf, ssize_t buflen); +extern void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, + ssize_t buflen); +extern ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, + ssize_t buflen); +extern void nfs4_xattr_cache_zap(struct inode *inode); #else +static inline void nfs4_xattr_cache_zap(struct inode *inode) +{ +} +#endif /* CONFIG_NFS_V4_2 */ + +#else /* CONFIG_NFS_V4 */ #define nfs4_close_state(a, b) do { } while (0) #define nfs4_close_sync(a, b) do { } while (0) #define nfs4_state_protect(a, b, c, d) do { } while (0) #define nfs4_state_protect_write(a, b, c, d) do { } while (0) + #endif /* CONFIG_NFS_V4 */ #endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 92a07956f07b..f670ff64b31e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7448,6 +7448,7 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, size_t buflen, int flags) { struct nfs_access_entry cache; + int ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return -EOPNOTSUPP; @@ -7466,10 +7467,17 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, return -EACCES; } - if (buf == NULL) - return nfs42_proc_removexattr(inode, key); - else - return nfs42_proc_setxattr(inode, key, buf, buflen, flags); + if (buf == NULL) { + ret = nfs42_proc_removexattr(inode, key); + if (!ret) + nfs4_xattr_cache_remove(inode, key); + } else { + ret = nfs42_proc_setxattr(inode, key, buf, buflen, flags); + if (!ret) + nfs4_xattr_cache_add(inode, key, buf, NULL, buflen); + } + + return ret; } static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, @@ -7477,6 +7485,7 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, const char *key, void *buf, size_t buflen) { struct nfs_access_entry cache; + ssize_t ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return -EOPNOTSUPP; @@ -7486,7 +7495,17 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, return -EACCES; } - return nfs42_proc_getxattr(inode, key, buf, buflen); + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret) + return ret; + + ret = nfs4_xattr_cache_get(inode, key, buf, buflen); + if (ret >= 0 || (ret < 0 && ret != -ENOENT)) + return ret; + + ret = nfs42_proc_getxattr(inode, key, buf, buflen); + + return ret; } static ssize_t @@ -7494,7 +7513,7 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) { u64 cookie; bool eof; - int ret, size; + ssize_t ret, size; char *buf; size_t buflen; struct nfs_access_entry cache; @@ -7507,6 +7526,14 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) return 0; } + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret) + return ret; + + ret = nfs4_xattr_cache_list(inode, list, list_len); + if (ret >= 0 || (ret < 0 && ret != -ENOENT)) + return ret; + cookie = 0; eof = false; buflen = list_len ? list_len : XATTR_LIST_MAX; @@ -7526,6 +7553,9 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) size += ret; } + if (list_len) + nfs4_xattr_cache_set_list(inode, list, size); + return size; } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 1475f932d7da..0c1ab846b83d 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -69,6 +69,7 @@ static void nfs4_evict_inode(struct inode *inode) pnfs_destroy_layout(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); + nfs4_xattr_cache_zap(inode); } struct nfs_referral_count { @@ -268,6 +269,12 @@ static int __init init_nfs_v4(void) if (err) goto out1; +#ifdef CONFIG_NFS_V4_2 + err = nfs4_xattr_cache_init(); + if (err) + goto out2; +#endif + err = nfs4_register_sysctl(); if (err) goto out2; @@ -288,6 +295,9 @@ static void __exit exit_nfs_v4(void) nfs4_pnfs_v3_ds_connect_unload(); unregister_nfs_version(&nfs_v4); +#ifdef CONFIG_NFS_V4_2 + nfs4_xattr_cache_exit(); +#endif nfs4_unregister_sysctl(); nfs_idmap_quit(); nfs_dns_resolver_destroy(); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 943ee750d68c..a2c6455ea3fa 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -102,6 +102,8 @@ struct nfs_delegation; struct posix_acl; +struct nfs4_xattr_cache; + /* * nfs fs inode data in memory */ @@ -188,6 +190,10 @@ struct nfs_inode { struct fscache_cookie *fscache; #endif struct inode vfs_inode; + +#ifdef CONFIG_NFS_V4_2 + struct nfs4_xattr_cache *xattr_cache; +#endif }; struct nfs4_copy_state { diff --git a/include/uapi/linux/nfs_fs.h b/include/uapi/linux/nfs_fs.h index 7bcc8cd6831d..3afe3767c55d 100644 --- a/include/uapi/linux/nfs_fs.h +++ b/include/uapi/linux/nfs_fs.h @@ -56,6 +56,7 @@ #define NFSDBG_PNFS 0x1000 #define NFSDBG_PNFS_LD 0x2000 #define NFSDBG_STATE 0x4000 +#define NFSDBG_XATTRCACHE 0x8000 #define NFSDBG_ALL 0xFFFF -- cgit v1.2.3 From 8aa5a33578e9685d06020bd10d1637557423e945 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Wed, 8 Jul 2020 07:28:33 +0000 Subject: xsk: Add new statistics It can be useful for the user to know the reason behind a dropped packet. Introduce new counters which track drops on the receive path caused by: 1. rx ring being full 2. fill ring being empty Also, on the tx path introduce a counter which tracks the number of times we attempt pull from the tx ring when it is empty. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200708072835.4427-2-ciara.loftus@intel.com --- include/net/xdp_sock.h | 4 ++++ include/uapi/linux/if_xdp.h | 5 ++++- net/xdp/xsk.c | 36 +++++++++++++++++++++++++++++++----- net/xdp/xsk_buff_pool.c | 1 + net/xdp/xsk_queue.h | 6 ++++++ tools/include/uapi/linux/if_xdp.h | 5 ++++- 6 files changed, 50 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 96bfc5f5f24e..c9d87cc40c11 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -69,7 +69,11 @@ struct xdp_sock { spinlock_t tx_completion_lock; /* Protects generic receive. */ spinlock_t rx_lock; + + /* Statistics */ u64 rx_dropped; + u64 rx_queue_full; + struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index be328c59389d..a78a8096f4ce 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -73,9 +73,12 @@ struct xdp_umem_reg { }; struct xdp_statistics { - __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_dropped; /* Dropped for other reasons */ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ }; struct xdp_options { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3700266229f6..26e3bba8c204 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -123,7 +123,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) addr = xp_get_handle(xskb); err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) { - xs->rx_dropped++; + xs->rx_queue_full++; return err; } @@ -274,8 +274,10 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { - if (!xskq_cons_peek_desc(xs->tx, desc, umem)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) { + xs->tx->queue_empty_descs++; continue; + } /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -387,6 +389,8 @@ static int xsk_generic_xmit(struct sock *sk) sent_frame = true; } + xs->tx->queue_empty_descs++; + out: if (sent_frame) sk->sk_write_space(sk); @@ -812,6 +816,12 @@ static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) ring->desc = offsetof(struct xdp_umem_ring, desc); } +struct xdp_statistics_v1 { + __u64 rx_dropped; + __u64 rx_invalid_descs; + __u64 tx_invalid_descs; +}; + static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -831,19 +841,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, case XDP_STATISTICS: { struct xdp_statistics stats; + bool extra_stats = true; + size_t stats_size; - if (len < sizeof(stats)) + if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; + } else if (len < sizeof(stats)) { + extra_stats = false; + stats_size = sizeof(struct xdp_statistics_v1); + } else { + stats_size = sizeof(stats); + } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; + if (extra_stats) { + stats.rx_ring_full = xs->rx_queue_full; + stats.rx_fill_ring_empty_descs = + xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); + } else { + stats.rx_dropped += xs->rx_queue_full; + } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); - if (copy_to_user(optval, &stats, sizeof(stats))) + if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; - if (put_user(sizeof(stats), optlen)) + if (put_user(stats_size, optlen)) return -EFAULT; return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 540ed75e4482..89cf3551d3e9 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -235,6 +235,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + pool->fq->queue_empty_descs++; xp_release(xskb); return NULL; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5b5d24d2dd37..bf42cfd74b89 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -38,6 +38,7 @@ struct xsk_queue { u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; + u64 queue_empty_descs; }; /* The structure of the shared state of the rings are the same as the @@ -354,6 +355,11 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } +static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) +{ + return q ? q->queue_empty_descs : 0; +} + struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index be328c59389d..a78a8096f4ce 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -73,9 +73,12 @@ struct xdp_umem_reg { }; struct xdp_statistics { - __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_dropped; /* Dropped for other reasons */ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ }; struct xdp_options { -- cgit v1.2.3 From 0d80cb4612aa32dc0faa17fa3ab6f96f33e2b4a7 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Wed, 8 Jul 2020 07:28:35 +0000 Subject: xsk: Add xdp statistics to xsk_diag Add xdp statistics to the information dumped through the xsk_diag interface Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200708072835.4427-4-ciara.loftus@intel.com --- include/uapi/linux/xdp_diag.h | 11 +++++++++++ net/xdp/xsk_diag.c | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h index 78b2591a7782..66b9973b4f4c 100644 --- a/include/uapi/linux/xdp_diag.h +++ b/include/uapi/linux/xdp_diag.h @@ -30,6 +30,7 @@ struct xdp_diag_msg { #define XDP_SHOW_RING_CFG (1 << 1) #define XDP_SHOW_UMEM (1 << 2) #define XDP_SHOW_MEMINFO (1 << 3) +#define XDP_SHOW_STATS (1 << 4) enum { XDP_DIAG_NONE, @@ -41,6 +42,7 @@ enum { XDP_DIAG_UMEM_FILL_RING, XDP_DIAG_UMEM_COMPLETION_RING, XDP_DIAG_MEMINFO, + XDP_DIAG_STATS, __XDP_DIAG_MAX, }; @@ -69,4 +71,13 @@ struct xdp_diag_umem { __u32 refs; }; +struct xdp_diag_stats { + __u64 n_rx_dropped; + __u64 n_rx_invalid; + __u64 n_rx_full; + __u64 n_fill_ring_empty; + __u64 n_tx_invalid; + __u64 n_tx_ring_empty; +}; + #endif /* _LINUX_XDP_DIAG_H */ diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 0163b26aaf63..21e9c2d123ee 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -76,6 +76,19 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) return err; } +static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_stats du = {}; + + du.n_rx_dropped = xs->rx_dropped; + du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx); + du.n_rx_full = xs->rx_queue_full; + du.n_fill_ring_empty = xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx); + du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx); + return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du); +} + static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, struct xdp_diag_req *req, struct user_namespace *user_ns, @@ -118,6 +131,10 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) goto out_nlmsg_trim; + if ((req->xdiag_show & XDP_SHOW_STATS) && + xsk_diag_put_stats(xs, nlskb)) + goto out_nlmsg_trim; + mutex_unlock(&xs->mutex); nlmsg_end(nlskb, nlh); return 0; -- cgit v1.2.3 From ed757328c34015be4ec51861a90bb3bcc807ad58 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Mon, 13 Jul 2020 12:24:18 +0200 Subject: atm: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: David S. Miller --- drivers/atm/solos-pci.c | 2 +- include/uapi/linux/atmioc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c index c32f7dd9879a..b7646ae55942 100644 --- a/drivers/atm/solos-pci.c +++ b/drivers/atm/solos-pci.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Driver for the Solos PCI ADSL2+ card, designed to support Linux by - * Traverse Technologies -- http://www.traverse.com.au/ + * Traverse Technologies -- https://www.traverse.com.au/ * Xrio Limited -- http://www.xrio.com/ * * Copyright © 2008 Traverse Technologies diff --git a/include/uapi/linux/atmioc.h b/include/uapi/linux/atmioc.h index cd7655e40c77..a9030bcc8d56 100644 --- a/include/uapi/linux/atmioc.h +++ b/include/uapi/linux/atmioc.h @@ -5,7 +5,7 @@ /* - * See http://icawww1.epfl.ch/linux-atm/magic.html for the complete list of + * See https://icawww1.epfl.ch/linux-atm/magic.html for the complete list of * "magic" ioctl numbers. */ -- cgit v1.2.3 From 2801758391ba6b0c20e253b956355e1b15ad85a2 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:48 +0200 Subject: bridge: uapi: mrp: Extend MRP attributes for MRP interconnect Extend the existing MRP netlink attributes to allow to configure MRP Interconnect: IFLA_BRIDGE_MRP_IN_ROLE - the parameter type is br_mrp_in_role which contains the interconnect id, the ring id, the interconnect role(MIM or MIC) and the port ifindex that represents the interconnect port. IFLA_BRIDGE_MRP_IN_STATE - the parameter type is br_mrp_in_state which contains the interconnect id and the interconnect state. IFLA_BRIDGE_MRP_IN_TEST - the parameter type is br_mrp_start_in_test which contains the interconnect id, the interval at which to send MRP_InTest frames, how many test frames can be missed before declaring the interconnect ring open and the period which represents for how long to send MRP_InTest frames. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 53 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/mrp_bridge.h | 38 +++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c114c1c2bd53..d840a3e37a37 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -167,6 +167,9 @@ enum { IFLA_BRIDGE_MRP_RING_ROLE, IFLA_BRIDGE_MRP_START_TEST, IFLA_BRIDGE_MRP_INFO, + IFLA_BRIDGE_MRP_IN_ROLE, + IFLA_BRIDGE_MRP_IN_STATE, + IFLA_BRIDGE_MRP_START_IN_TEST, __IFLA_BRIDGE_MRP_MAX, }; @@ -245,6 +248,37 @@ enum { #define IFLA_BRIDGE_MRP_INFO_MAX (__IFLA_BRIDGE_MRP_INFO_MAX - 1) +enum { + IFLA_BRIDGE_MRP_IN_STATE_UNSPEC, + IFLA_BRIDGE_MRP_IN_STATE_IN_ID, + IFLA_BRIDGE_MRP_IN_STATE_STATE, + __IFLA_BRIDGE_MRP_IN_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_IN_STATE_MAX (__IFLA_BRIDGE_MRP_IN_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_IN_ROLE_RING_ID, + IFLA_BRIDGE_MRP_IN_ROLE_IN_ID, + IFLA_BRIDGE_MRP_IN_ROLE_ROLE, + IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX, + __IFLA_BRIDGE_MRP_IN_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_IN_ROLE_MAX (__IFLA_BRIDGE_MRP_IN_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID, + IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD, + __IFLA_BRIDGE_MRP_START_IN_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_IN_TEST_MAX (__IFLA_BRIDGE_MRP_START_IN_TEST_MAX - 1) + struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; @@ -270,6 +304,25 @@ struct br_mrp_start_test { __u32 monitor; }; +struct br_mrp_in_state { + __u32 in_state; + __u16 in_id; +}; + +struct br_mrp_in_role { + __u32 ring_id; + __u32 in_role; + __u32 i_ifindex; + __u16 in_id; +}; + +struct br_mrp_start_in_test { + __u32 interval; + __u32 max_miss; + __u32 period; + __u16 in_id; +}; + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index bee366540212..6aeb13ef0b1e 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -21,11 +21,22 @@ enum br_mrp_ring_role_type { BR_MRP_RING_ROLE_MRA, }; +enum br_mrp_in_role_type { + BR_MRP_IN_ROLE_DISABLED, + BR_MRP_IN_ROLE_MIC, + BR_MRP_IN_ROLE_MIM, +}; + enum br_mrp_ring_state_type { BR_MRP_RING_STATE_OPEN, BR_MRP_RING_STATE_CLOSED, }; +enum br_mrp_in_state_type { + BR_MRP_IN_STATE_OPEN, + BR_MRP_IN_STATE_CLOSED, +}; + enum br_mrp_port_state_type { BR_MRP_PORT_STATE_DISABLED, BR_MRP_PORT_STATE_BLOCKED, @@ -36,6 +47,7 @@ enum br_mrp_port_state_type { enum br_mrp_port_role_type { BR_MRP_PORT_ROLE_PRIMARY, BR_MRP_PORT_ROLE_SECONDARY, + BR_MRP_PORT_ROLE_INTER, }; enum br_mrp_tlv_header_type { @@ -45,6 +57,10 @@ enum br_mrp_tlv_header_type { BR_MRP_TLV_HEADER_RING_TOPO = 0x3, BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, + BR_MRP_TLV_HEADER_IN_TEST = 0x6, + BR_MRP_TLV_HEADER_IN_TOPO = 0x7, + BR_MRP_TLV_HEADER_IN_LINK_DOWN = 0x8, + BR_MRP_TLV_HEADER_IN_LINK_UP = 0x9, BR_MRP_TLV_HEADER_OPTION = 0x7f, }; @@ -118,4 +134,26 @@ struct br_mrp_oui_hdr { __u8 oui[MRP_OUI_LENGTH]; }; +struct br_mrp_in_test_hdr { + __be16 id; + __u8 sa[ETH_ALEN]; + __be16 port_role; + __be16 state; + __be16 transitions; + __be32 timestamp; +}; + +struct br_mrp_in_topo_hdr { + __u8 sa[ETH_ALEN]; + __be16 id; + __be16 interval; +}; + +struct br_mrp_in_link_hdr { + __u8 sa[ETH_ALEN]; + __be16 port_role; + __be16 id; + __be16 interval; +}; + #endif -- cgit v1.2.3 From 559139cb0405d38816e5e725adee9000db993235 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:56 +0200 Subject: bridge: uapi: mrp: Extend MRP_INFO attributes for interconnect status Extend the existing MRP_INFO to return status of MRP interconnect. In case there is no MRP interconnect on the node then the role will be disabled so the other attributes can be ignored. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index d840a3e37a37..c1227aecd38f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -243,6 +243,11 @@ enum { IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + IFLA_BRIDGE_MRP_INFO_I_IFINDEX, + IFLA_BRIDGE_MRP_INFO_IN_STATE, + IFLA_BRIDGE_MRP_INFO_IN_ROLE, + IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL, + IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS, __IFLA_BRIDGE_MRP_INFO_MAX, }; -- cgit v1.2.3 From ffb3adba64801f70c472303c9e386eb5eaec193d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:58 +0200 Subject: net: bridge: Add port attribute IFLA_BRPORT_MRP_IN_OPEN This patch adds a new port attribute, IFLA_BRPORT_MRP_IN_OPEN, which allows to notify the userspace when the node lost the contiuity of MRP_InTest frames. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 3 +++ tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cc185a007ade..26842ffd0501 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -344,6 +344,7 @@ enum { IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c532fa65c983..147d52596e17 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + 0; } @@ -216,6 +217,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_NEIGH_SUPPRESS)) || nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & BR_MRP_LOST_CONT)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, + !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cafedbbfefbe..781e482dc499 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -344,6 +344,7 @@ enum { IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 7cf97b12545503992020796c74bd84078eb39299 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 2 Jun 2020 18:10:43 -0700 Subject: seccomp: Introduce addfd ioctl to seccomp user notifier The current SECCOMP_RET_USER_NOTIF API allows for syscall supervision over an fd. It is often used in settings where a supervising task emulates syscalls on behalf of a supervised task in userspace, either to further restrict the supervisee's syscall abilities or to circumvent kernel enforced restrictions the supervisor deems safe to lift (e.g. actually performing a mount(2) for an unprivileged container). While SECCOMP_RET_USER_NOTIF allows for the interception of any syscall, only a certain subset of syscalls could be correctly emulated. Over the last few development cycles, the set of syscalls which can't be emulated has been reduced due to the addition of pidfd_getfd(2). With this we are now able to, for example, intercept syscalls that require the supervisor to operate on file descriptors of the supervisee such as connect(2). However, syscalls that cause new file descriptors to be installed can not currently be correctly emulated since there is no way for the supervisor to inject file descriptors into the supervisee. This patch adds a new addfd ioctl to remove this restriction by allowing the supervisor to install file descriptors into the intercepted task. By implementing this feature via seccomp the supervisor effectively instructs the supervisee to install a set of file descriptors into its own file descriptor table during the intercepted syscall. This way it is possible to intercept syscalls such as open() or accept(), and install (or replace, like dup2(2)) the supervisor's resulting fd into the supervisee. One replacement use-case would be to redirect the stdout and stderr of a supervisee into log file descriptors opened by the supervisor. The ioctl handling is based on the discussions[1] of how Extensible Arguments should interact with ioctls. Instead of building size into the addfd structure, make it a function of the ioctl command (which is how sizes are normally passed to ioctls). To support forward and backward compatibility, just mask out the direction and size, and match everything. The size (and any future direction) checks are done along with copy_struct_from_user() logic. As a note, the seccomp_notif_addfd structure is laid out based on 8-byte alignment without requiring packing as there have been packing issues with uapi highlighted before[2][3]. Although we could overload the newfd field and use -1 to indicate that it is not to be used, doing so requires changing the size of the fd field, and introduces struct packing complexity. [1]: https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/ [2]: https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/ [3]: https://lore.kernel.org/lkml/20200612104629.GA15814@ircssh-2.c.rugged-nimbus-611.internal Cc: Christoph Hellwig Cc: Christian Brauner Cc: Tycho Andersen Cc: Jann Horn Cc: Robert Sesek Cc: Chris Palmer Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-api@vger.kernel.org Suggested-by: Matt Denton Link: https://lore.kernel.org/r/20200603011044.7972-4-sargun@sargun.me Signed-off-by: Sargun Dhillon Reviewed-by: Will Drewry Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- include/linux/seccomp.h | 4 + include/uapi/linux/seccomp.h | 22 ++++++ kernel/seccomp.c | 175 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 199 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index babcd6c02d09..881c90b6aa25 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -10,6 +10,10 @@ SECCOMP_FILTER_FLAG_NEW_LISTENER | \ SECCOMP_FILTER_FLAG_TSYNC_ESRCH) +/* sizeof() the first published struct seccomp_notif_addfd */ +#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24 +#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0 + #ifdef CONFIG_SECCOMP #include diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 965290f7dcc2..6ba18b82a02e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -113,6 +113,25 @@ struct seccomp_notif_resp { __u32 flags; }; +/* valid flags for seccomp_notif_addfd */ +#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ + +/** + * struct seccomp_notif_addfd + * @id: The ID of the seccomp notification + * @flags: SECCOMP_ADDFD_FLAG_* + * @srcfd: The local fd number + * @newfd: Optional remote FD number if SETFD option is set, otherwise 0. + * @newfd_flags: The O_* flags the remote FD should have applied + */ +struct seccomp_notif_addfd { + __u64 id; + __u32 flags; + __u32 srcfd; + __u32 newfd; + __u32 newfd_flags; +}; + #define SECCOMP_IOC_MAGIC '!' #define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) #define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) @@ -124,5 +143,8 @@ struct seccomp_notif_resp { #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) +/* On success, the return value is the remote process's added fd number */ +#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ + struct seccomp_notif_addfd) #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 866a432cd746..3ee59ce0a323 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -87,10 +87,42 @@ struct seccomp_knotif { long val; u32 flags; - /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + /* + * Signals when this has changed states, such as the listener + * dying, a new seccomp addfd message, or changing to REPLIED + */ struct completion ready; struct list_head list; + + /* outstanding addfd requests */ + struct list_head addfd; +}; + +/** + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages + * + * @file: A reference to the file to install in the other task + * @fd: The fd number to install it at. If the fd number is -1, it means the + * installing process should allocate the fd as normal. + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC + * is allowed. + * @ret: The return value of the installing process. It is set to the fd num + * upon success (>= 0). + * @completion: Indicates that the installing process has completed fd + * installation, or gone away (either due to successful + * reply, or signal) + * + */ +struct seccomp_kaddfd { + struct file *file; + int fd; + unsigned int flags; + + /* To only be set on reply */ + int ret; + struct completion completion; + struct list_head list; }; /** @@ -793,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +{ + /* + * Remove the notification, and reset the list pointers, indicating + * that it has been handled. + */ + list_del_init(&addfd->list); + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + complete(&addfd->completion); +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -801,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall, u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; + struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; @@ -813,6 +857,7 @@ static int seccomp_do_user_notification(int this_syscall, n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add(&n.list, &match->notif->notifications); + INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); @@ -821,17 +866,34 @@ static int seccomp_do_user_notification(int this_syscall, /* * This is where we wait for a reply from userspace. */ +wait: err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err == 0) { + /* Check if we were woken up by a addfd message */ + addfd = list_first_entry_or_null(&n.addfd, + struct seccomp_kaddfd, list); + if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + seccomp_handle_addfd(addfd); + mutex_unlock(&match->notify_lock); + goto wait; + } ret = n.val; err = n.error; flags = n.flags; } + /* If there were any pending addfd calls, clear them out */ + list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { + /* The process went away before we got a chance to handle it */ + addfd->ret = -ESRCH; + list_del_init(&addfd->list); + complete(&addfd->completion); + } + /* * Note that it's possible the listener died in between the time when - * we were notified of a respons (or a signal) and when we were able to + * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * @@ -1069,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) knotif->error = -ENOSYS; knotif->val = 0; + /* + * We do not need to wake up any pending addfd messages, as + * the notifier will do that for us, as this just looks + * like a standard reply. + */ complete(&knotif->ready); } @@ -1233,12 +1300,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, return ret; } +static long seccomp_notify_addfd(struct seccomp_filter *filter, + struct seccomp_notif_addfd __user *uaddfd, + unsigned int size) +{ + struct seccomp_notif_addfd addfd; + struct seccomp_knotif *knotif; + struct seccomp_kaddfd kaddfd; + int ret; + + BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); + BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); + + if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) + return -EINVAL; + + ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); + if (ret) + return ret; + + if (addfd.newfd_flags & ~O_CLOEXEC) + return -EINVAL; + + if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + return -EINVAL; + + if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) + return -EINVAL; + + kaddfd.file = fget(addfd.srcfd); + if (!kaddfd.file) + return -EBADF; + + kaddfd.flags = addfd.newfd_flags; + kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? + addfd.newfd : -1; + init_completion(&kaddfd.completion); + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + goto out; + + knotif = find_notification(filter, addfd.id); + if (!knotif) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * We do not want to allow for FD injection to occur before the + * notification has been picked up by a userspace handler, or after + * the notification has been replied to. + */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out_unlock; + } + + list_add(&kaddfd.list, &knotif->addfd); + complete(&knotif->ready); + mutex_unlock(&filter->notify_lock); + + /* Now we wait for it to be processed or be interrupted */ + ret = wait_for_completion_interruptible(&kaddfd.completion); + if (ret == 0) { + /* + * We had a successful completion. The other side has already + * removed us from the addfd queue, and + * wait_for_completion_interruptible has a memory barrier upon + * success that lets us read this value directly without + * locking. + */ + ret = kaddfd.ret; + goto out; + } + + mutex_lock(&filter->notify_lock); + /* + * Even though we were woken up by a signal and not a successful + * completion, a completion may have happened in the mean time. + * + * We need to check again if the addfd request has been handled, + * and if not, we will remove it from the queue. + */ + if (list_empty(&kaddfd.list)) + ret = kaddfd.ret; + else + list_del(&kaddfd.list); + +out_unlock: + mutex_unlock(&filter->notify_lock); +out: + fput(kaddfd.file); + + return ret; +} + static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seccomp_filter *filter = file->private_data; void __user *buf = (void __user *)arg; + /* Fixed-size ioctls */ switch (cmd) { case SECCOMP_IOCTL_NOTIF_RECV: return seccomp_notify_recv(filter, buf); @@ -1247,6 +1411,13 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + } + + /* Extensible Argument ioctls */ +#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) + switch (EA_IOCTL(cmd)) { + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): + return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); default: return -EINVAL; } -- cgit v1.2.3 From 66137f54ccd7eb7510e86a8bec59aab313350794 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 14 Jul 2020 22:23:47 -0700 Subject: drm: i915_drm.h: delete duplicated words in comments Drop doubled words "the" and "be" in comments. Signed-off-by: Randy Dunlap Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200715052349.23319-5-rdunlap@infradead.org --- include/uapi/drm/i915_drm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 14b67cd6b54b..00546062e023 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -55,7 +55,7 @@ extern "C" { * cause the related events to not be seen. * * I915_RESET_UEVENT - Event is generated just before an attempt to reset the - * the GPU. The value supplied with the event is always 1. NOTE: Disable + * GPU. The value supplied with the event is always 1. NOTE: Disable * reset via module parameter will cause this event to not be seen. */ #define I915_L3_PARITY_UEVENT "L3_PARITY_ERROR" @@ -1934,7 +1934,7 @@ enum drm_i915_perf_property_id { /** * The value specifies which set of OA unit metrics should be - * be configured, defining the contents of any OA unit reports. + * configured, defining the contents of any OA unit reports. * * This property is available in perf revision 1. */ -- cgit v1.2.3 From afae47af0cb73e36d61b61c8d93cb9d755853050 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 14 Jul 2020 22:23:48 -0700 Subject: drm: msm_drm.h: delete duplicated words in comments Drop the doubled word "to" in comments. Signed-off-by: Randy Dunlap Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200715052349.23319-6-rdunlap@infradead.org --- include/uapi/drm/msm_drm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 19806eb3a8e8..a6c1f3eb2623 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -252,8 +252,8 @@ struct drm_msm_gem_submit { __u64 cmds; /* in, ptr to array of submit_cmd's */ __s32 fence_fd; /* in/out fence fd (see MSM_SUBMIT_FENCE_FD_IN/OUT) */ __u32 queueid; /* in, submitqueue id */ - __u64 in_syncobjs; /* in, ptr to to array of drm_msm_gem_submit_syncobj */ - __u64 out_syncobjs; /* in, ptr to to array of drm_msm_gem_submit_syncobj */ + __u64 in_syncobjs; /* in, ptr to array of drm_msm_gem_submit_syncobj */ + __u64 out_syncobjs; /* in, ptr to array of drm_msm_gem_submit_syncobj */ __u32 nr_in_syncobjs; /* in, number of entries in in_syncobj */ __u32 nr_out_syncobjs; /* in, number of entries in out_syncobj. */ __u32 syncobj_stride; /* in, stride of syncobj arrays. */ -- cgit v1.2.3 From 938a0650aae6275ba8e924685836bdee2c6aa3db Mon Sep 17 00:00:00 2001 From: Amber Lin Date: Wed, 13 May 2020 08:19:29 -0400 Subject: drm/amdkfd: Provide SMI events watch When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register devices and subscribe events they are interested. After registered, the user can use annoymous file descriptor's poll function with wait-time specified and wait for events to happen. Once an event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h v3: send the event msg in text than in binary v4: support multiple clients v5: move events enablement from ioctl to fd write v6: sparse fix Signed-off-by: Amber Lin Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 18 ++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 215 +++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 29 +++ include/uapi/linux/kfd_ioctl.h | 16 +- 9 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 61474627a32c..e1e4115dcf78 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba93cfe0..24b471734117 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(&info, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index cf0017f4d9d5..e9b96ad3d9a5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1740,6 +1741,20 @@ err_unlock: return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpuid); + if (!dev) + return -EINVAL; + + return kfd_smi_event_open(dev, &args->anon_fd); +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 3a646e757560..4bfedaab183f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -635,6 +635,11 @@ static int kfd_gws_init(struct kfd_dev *kfd) return ret; } +static void kfd_smi_init(struct kfd_dev *dev) { + INIT_LIST_HEAD(&dev->smi_clients); + spin_lock_init(&dev->smi_lock); +} + bool kgd2kfd_device_init(struct kfd_dev *kfd, struct drm_device *ddev, const struct kgd2kfd_shared_resources *gpu_resources) @@ -749,6 +754,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto kfd_topology_add_device_error; } + kfd_smi_init(kfd); + kfd->init_complete = true; dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, kfd->pdev->device); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index fce6ccabe38b..241bd6ff79f4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "soc15_int.h" #include "kfd_device_queue_manager.h" +#include "kfd_smi_events.h" static bool event_interrupt_isr_v9(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, info.prot_read = ring_id & 0x10; info.prot_write = ring_id & 0x20; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); kfd_signal_vm_fault_event(dev, pasid, &info); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 67b9c398f0e4..6727e9de5b8b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -305,6 +305,10 @@ struct kfd_dev { /* Global GWS resource shared between processes */ void *gws; + + /* Clients watching SMI events */ + struct list_head smi_clients; + spinlock_t smi_lock; }; enum kfd_mempool { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c new file mode 100644 index 000000000000..bfc9330df987 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -0,0 +1,215 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "amdgpu_vm.h" +#include "kfd_priv.h" +#include "kfd_smi_events.h" + +struct kfd_smi_client { + struct list_head list; + struct kfifo fifo; + wait_queue_head_t wait_queue; + /* events enabled */ + uint64_t events; + struct kfd_dev *dev; + spinlock_t lock; +}; + +#define MAX_KFIFO_SIZE 1024 + +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); +static ssize_t kfd_smi_ev_write(struct file *, const char __user *, size_t, + loff_t *); +static int kfd_smi_ev_release(struct inode *, struct file *); + +static const char kfd_smi_name[] = "kfd_smi_ev"; + +static const struct file_operations kfd_smi_ev_fops = { + .owner = THIS_MODULE, + .poll = kfd_smi_ev_poll, + .read = kfd_smi_ev_read, + .write = kfd_smi_ev_write, + .release = kfd_smi_ev_release +}; + +static __poll_t kfd_smi_ev_poll(struct file *filep, + struct poll_table_struct *wait) +{ + struct kfd_smi_client *client = filep->private_data; + __poll_t mask = 0; + + poll_wait(filep, &client->wait_queue, wait); + + spin_lock(&client->lock); + if (!kfifo_is_empty(&client->fifo)) + mask = EPOLLIN | EPOLLRDNORM; + spin_unlock(&client->lock); + + return mask; +} + +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, + size_t size, loff_t *offset) +{ + int ret; + size_t to_copy; + struct kfd_smi_client *client = filep->private_data; + unsigned char buf[MAX_KFIFO_SIZE]; + + BUILD_BUG_ON(MAX_KFIFO_SIZE > 1024); + + /* kfifo_to_user can sleep so we can't use spinlock protection around + * it. Instead, we kfifo out as spinlocked then copy them to the user. + */ + spin_lock(&client->lock); + to_copy = kfifo_len(&client->fifo); + if (!to_copy) { + spin_unlock(&client->lock); + return -EAGAIN; + } + to_copy = min3(size, sizeof(buf), to_copy); + ret = kfifo_out(&client->fifo, buf, to_copy); + spin_unlock(&client->lock); + if (ret <= 0) + return -EAGAIN; + + ret = copy_to_user(user, buf, to_copy); + if (ret) + return -EFAULT; + + return to_copy; +} + +static ssize_t kfd_smi_ev_write(struct file *filep, const char __user *user, + size_t size, loff_t *offset) +{ + struct kfd_smi_client *client = filep->private_data; + uint64_t events; + + if (!access_ok(user, size) || size < sizeof(events)) + return -EFAULT; + if (copy_from_user(&events, user, sizeof(events))) + return -EFAULT; + + WRITE_ONCE(client->events, events); + + return sizeof(events); +} + +static int kfd_smi_ev_release(struct inode *inode, struct file *filep) +{ + struct kfd_smi_client *client = filep->private_data; + struct kfd_dev *dev = client->dev; + + spin_lock(&dev->smi_lock); + list_del_rcu(&client->list); + spin_unlock(&dev->smi_lock); + + synchronize_rcu(); + kfifo_free(&client->fifo); + kfree(client); + + return 0; +} + +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; + struct amdgpu_task_info task_info; + /* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */ + /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43 + */ + char fifo_in[43]; + struct kfd_smi_client *client; + int len; + + if (list_empty(&dev->smi_clients)) + return; + + memset(&task_info, 0, sizeof(struct amdgpu_task_info)); + amdgpu_vm_get_task_info(adev, pasid, &task_info); + /* Report VM faults from user applications, not retry from kernel */ + if (!task_info.pid) + return; + + len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, + task_info.pid, task_info.task_name); + + rcu_read_lock(); + + list_for_each_entry_rcu(client, &dev->smi_clients, list) { + if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT)) + continue; + spin_lock(&client->lock); + if (kfifo_avail(&client->fifo) >= len) { + kfifo_in(&client->fifo, fifo_in, len); + wake_up_all(&client->wait_queue); + } + else + pr_debug("smi_event(vmfault): no space left\n"); + spin_unlock(&client->lock); + } + + rcu_read_unlock(); +} + +int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) +{ + struct kfd_smi_client *client; + int ret; + + client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL); + if (!client) + return -ENOMEM; + INIT_LIST_HEAD(&client->list); + + ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL); + if (ret) { + kfree(client); + return ret; + } + + ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client, + O_RDWR); + if (ret < 0) { + kfifo_free(&client->fifo); + kfree(client); + return ret; + } + *fd = ret; + + init_waitqueue_head(&client->wait_queue); + spin_lock_init(&client->lock); + client->events = 0; + client->dev = dev; + + spin_lock(&dev->smi_lock); + list_add_rcu(&client->list, &dev->smi_clients); + spin_unlock(&dev->smi_lock); + + return 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h new file mode 100644 index 000000000000..a9cb218fef96 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -0,0 +1,29 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_SMI_EVENTS_H_INCLUDED +#define KFD_SMI_EVENTS_H_INCLUDED + +int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); + +#endif diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index b6be62356d34..733c183d165e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -442,6 +442,17 @@ struct kfd_ioctl_import_dmabuf_args { __u32 dmabuf_fd; /* to KFD */ }; +/* + * KFD SMI(System Management Interface) events + */ +/* Event type (defined by bitmask) */ +#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 + +struct kfd_ioctl_smi_events_args { + __u32 gpuid; /* to KFD */ + __u32 anon_fd; /* from KFD */ +}; + /* Register offset inside the remapped mmio page */ enum kfd_mmio_remap { @@ -546,7 +557,10 @@ enum kfd_mmio_remap { #define AMDKFD_IOC_ALLOC_QUEUE_GWS \ AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) +#define AMDKFD_IOC_SMI_EVENTS \ + AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1F +#define AMDKFD_COMMAND_END 0x20 #endif -- cgit v1.2.3 From 91e2c1919230c719c32a3076657f51d4c6074513 Mon Sep 17 00:00:00 2001 From: Amber Lin Date: Mon, 20 Apr 2020 19:42:46 -0400 Subject: include/uapi/linux: Update KFD ioctl version Bump KFD ioctl after adding SMI events support Signed-off-by: Amber Lin Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 733c183d165e..f738c3b53f4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -26,8 +26,12 @@ #include #include +/* + * - 1.1 - initial version + * - 1.3 - Add SMI events support + */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 1 +#define KFD_IOCTL_MINOR_VERSION 3 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit v1.2.3 From c201324b54553aebb32845193680f21eb493c6e5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:42:42 -0700 Subject: net: caif: drop duplicate words in comments Drop doubled words "or" and "the" in several comments. Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Jakub Kicinski --- include/net/caif/caif_layer.h | 4 ++-- include/uapi/linux/caif/caif_socket.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/caif/caif_layer.h b/include/net/caif/caif_layer.h index 064094101cb5..51f7bb42a936 100644 --- a/include/net/caif/caif_layer.h +++ b/include/net/caif/caif_layer.h @@ -156,7 +156,7 @@ struct cflayer { * CAIF packets upwards in the stack. * Packet handling rules: * - The CAIF packet (cfpkt) ownership is passed to the - * called receive function. This means that the the + * called receive function. This means that the * packet cannot be accessed after passing it to the * above layer using up->receive(). * @@ -184,7 +184,7 @@ struct cflayer { * CAIF packet downwards in the stack. * Packet handling rules: * - The CAIF packet (cfpkt) ownership is passed to the - * transmit function. This means that the the packet + * transmit function. This means that the packet * cannot be accessed after passing it to the below * layer using dn->transmit(). * diff --git a/include/uapi/linux/caif/caif_socket.h b/include/uapi/linux/caif/caif_socket.h index 10ec1d1cf68e..d9970bbaa156 100644 --- a/include/uapi/linux/caif/caif_socket.h +++ b/include/uapi/linux/caif/caif_socket.h @@ -169,7 +169,7 @@ struct sockaddr_caif { * @CAIFSO_LINK_SELECT: Selector used if multiple CAIF Link layers are * available. Either a high bandwidth * link can be selected (CAIF_LINK_HIGH_BANDW) or - * or a low latency link (CAIF_LINK_LOW_LATENCY). + * a low latency link (CAIF_LINK_LOW_LATENCY). * This option is of type __u32. * Alternatively SO_BINDTODEVICE can be used. * -- cgit v1.2.3 From 644bfe51fa49c22244d24e896cd3fe3ee2f2cfd1 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:37 +0200 Subject: cpumap: Formalize map value as a named struct As it has been already done for devmap, introduce 'struct bpf_cpumap_val' to formalize the expected values that can be passed in for a CPUMAP. Update cpumap code to use the struct. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/754f950674665dae6139c061d28c1d982aaf4170.1594734381.git.lorenzo@kernel.org --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/cpumap.c | 28 +++++++++++++++------------- tools/include/uapi/linux/bpf.h | 9 +++++++++ 3 files changed, 33 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 323c91c4fab0..ff48dc00e8d0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -52,7 +52,6 @@ struct xdp_bulk_queue { struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ - u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; @@ -62,10 +61,13 @@ struct bpf_cpu_map_entry { /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; - struct work_struct kthread_stop_wq; + + struct bpf_cpumap_val value; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; + + struct work_struct kthread_stop_wq; }; struct bpf_cpu_map { @@ -307,8 +309,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, - int map_id) +static struct bpf_cpu_map_entry * +__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -338,13 +340,13 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->queue) goto free_bulkq; - err = ptr_ring_init(rcpu->queue, qsize, gfp); + err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map_id; - rcpu->qsize = qsize; + rcpu->value.qsize = value->qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -437,12 +439,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpumap_val cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; - /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; - /* Value is the queue size */ - u32 qsize = *(u32 *)value; + + memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -450,18 +452,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; - if (qsize == 0) { + if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; @@ -523,7 +525,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); - return rcpu ? &rcpu->qsize : NULL; + return rcpu ? &rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, -- cgit v1.2.3 From 9216477449f33cdbc9c9a99d49f500b7fbb81702 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:38 +0200 Subject: bpf: cpumap: Add the possibility to attach an eBPF program to cpumap Introduce the capability to attach an eBPF program to cpumap entries. The idea behind this feature is to add the possibility to define on which CPU run the eBPF program if the underlying hw does not support RSS. Current supported verdicts are XDP_DROP and XDP_PASS. This patch has been tested on Marvell ESPRESSObin using xdp_redirect_cpu sample available in the kernel tree to identify possible performance regressions. Results show there are no observable differences in packet-per-second: $./xdp_redirect_cpu --progname xdp_cpu_map0 --dev eth0 --cpu 1 rx: 354.8 Kpps rx: 356.0 Kpps rx: 356.8 Kpps rx: 356.3 Kpps rx: 356.6 Kpps rx: 356.6 Kpps rx: 356.7 Kpps rx: 355.8 Kpps rx: 356.8 Kpps rx: 356.8 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/5c9febdf903d810b3415732e5cd98491d7d9067a.1594734381.git.lorenzo@kernel.org --- include/linux/bpf.h | 6 ++ include/net/xdp.h | 5 ++ include/trace/events/xdp.h | 14 +++-- include/uapi/linux/bpf.h | 5 ++ kernel/bpf/cpumap.c | 121 ++++++++++++++++++++++++++++++++++++----- net/core/dev.c | 9 +++ tools/include/uapi/linux/bpf.h | 5 ++ 7 files changed, 148 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c67c88ad35f8..54ad426dbea1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1272,6 +1272,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); +bool cpu_map_prog_allowed(struct bpf_map *map); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1432,6 +1433,11 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return false; +} + static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 5b383c450858..83b9e0142b52 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -98,6 +98,11 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +struct xdp_cpumap_stats { + unsigned int pass; + unsigned int drop; +}; + /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b73d3e141323..e2c99f5bee39 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -177,9 +177,9 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, - int sched), + int sched, struct xdp_cpumap_stats *xdp_stats), - TP_ARGS(map_id, processed, drops, sched), + TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) @@ -188,6 +188,8 @@ TRACE_EVENT(xdp_cpumap_kthread, __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) + __field(unsigned int, xdp_pass) + __field(unsigned int, xdp_drop) ), TP_fast_assign( @@ -197,16 +199,20 @@ TRACE_EVENT(xdp_cpumap_kthread, __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; + __entry->xdp_pass = xdp_stats->pass; + __entry->xdp_drop = xdp_stats->drop; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" - " sched=%d", + " sched=%d" + " xdp_pass=%u xdp_drop=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, - __entry->sched) + __entry->sched, + __entry->xdp_pass, __entry->xdp_drop) ); TRACE_EVENT(xdp_cpumap_enqueue, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ff48dc00e8d0..b3a8aea81ee5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -63,6 +63,7 @@ struct bpf_cpu_map_entry { struct task_struct *kthread; struct bpf_cpumap_val value; + struct bpf_prog *prog; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; @@ -82,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { + u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; u64 cost; @@ -92,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || + attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); cmap = kzalloc(sizeof(*cmap), GFP_USER); @@ -214,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { if (atomic_dec_and_test(&rcpu->refcnt)) { + if (rcpu->prog) + bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); @@ -222,6 +228,62 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, + void **frames, int n, + struct xdp_cpumap_stats *stats) +{ + struct xdp_rxq_info rxq; + struct xdp_buff xdp; + int i, nframes = 0; + + if (!rcpu->prog) + return n; + + rcu_read_lock(); + + xdp_set_return_frame_no_direct(); + xdp.rxq = &rxq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + rxq.dev = xdpf->dev_rx; + rxq.mem = xdpf->mem; + /* TODO: report queue_index to xdp_rxq_info */ + + xdp_convert_frame_to_buff(xdpf, &xdp); + + act = bpf_prog_run_xdp(rcpu->prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (err < 0) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + frames[nframes++] = xdpf; + stats->pass++; + } + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough */ + case XDP_DROP: + xdp_return_frame(xdpf); + stats->drop++; + break; + } + } + + xdp_clear_return_frame_no_direct(); + + rcu_read_unlock(); + + return nframes; +} + #define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) @@ -236,11 +298,12 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_cpumap_stats stats = {}; /* zero stats */ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m; + int i, n, m, nframes; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -261,8 +324,8 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - + n = __ptr_ring_consume_batched(rcpu->queue, frames, + CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page = virt_to_page(f); @@ -274,15 +337,19 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } - m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < n; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - drops = n; + /* Support running another XDP prog on this CPU */ + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + if (nframes) { + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < nframes; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops += nframes; + } } local_bh_disable(); - for (i = 0; i < n; i++) { + for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; int ret; @@ -299,7 +366,7 @@ static int cpu_map_kthread_run(void *data) drops++; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -309,13 +376,38 @@ static int cpu_map_kthread_run(void *data) return 0; } +bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_CPUMAP && + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); +} + +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + bpf_prog_put(prog); + return -EINVAL; + } + + rcpu->value.bpf_prog.id = prog->aux->id; + rcpu->prog = prog; + + return 0; +} + static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { + int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; - int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -357,6 +449,9 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; + /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); diff --git a/net/core/dev.c b/net/core/dev.c index b61075828358..b820527f0a8d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5448,6 +5448,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) for (i = 0; i < new->aux->used_map_cnt; i++) { if (dev_map_can_have_prog(new->aux->used_maps[i])) return -EINVAL; + if (cpu_map_prog_allowed(new->aux->used_maps[i])) + return -EINVAL; } } @@ -8875,6 +8877,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, + "BPF_XDP_CPUMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { -- cgit v1.2.3 From bfdfa51702dec67e9fcd52568b4cf3c7f799db8b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 18:29:11 -0700 Subject: bpf: Drop duplicated words in uapi helper comments Drop doubled words "will" and "attach". Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/6b9f71ae-4f8e-0259-2c5d-187ddaefe6eb@infradead.org --- include/uapi/linux/bpf.h | 6 +++--- tools/include/uapi/linux/bpf.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c010b57fce3f..7ac3992dacfe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2420,7 +2420,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2457,7 +2457,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -4000,7 +4000,7 @@ struct bpf_link_info { /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c010b57fce3f..7ac3992dacfe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2420,7 +2420,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2457,7 +2457,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -4000,7 +4000,7 @@ struct bpf_link_info { /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ -- cgit v1.2.3 From bbe4f4245271bd0f21bf826996c0c5d87a3529c9 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 7 Jul 2020 09:30:59 +0300 Subject: RDMA/qedr: Add EDPM mode type for user-fw compatibility In older FW versions the completion flag was treated as the ack flag in edpm messages. commit ff937b916eb6 ("qed: Add EDPM mode type for user-fw compatibility") exposed the FW option of setting which mode the QP is in by adding a flag to the qedr <-> qed API. This patch adds the qedr <-> libqedr interface so that the libqedr can set the flag appropriately and qedr can pass it down to FW. Flag is added for backward compatibility with libqedr. For older libs, this flag didn't exist and therefore set to zero. Fixes: ac1b36e55a51 ("qedr: Add support for user context verbs") Link: https://lore.kernel.org/r/20200707063100.3811-2-michal.kalderon@marvell.com Signed-off-by: Yuval Bason Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/qedr.h | 1 + drivers/infiniband/hw/qedr/verbs.c | 11 ++++++++--- include/uapi/rdma/qedr-abi.h | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index aa332027da86..460292179b32 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -235,6 +235,7 @@ struct qedr_ucontext { u32 dpi_size; u16 dpi; bool db_rec; + u8 edpm_mode; }; union db_prod32 { diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 42273aa0b5e1..5008149ea116 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -275,7 +275,8 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) DP_ERR(dev, "Problem copying data from user space\n"); return -EFAULT; } - + ctx->edpm_mode = !!(ureq.context_flags & + QEDR_ALLOC_UCTX_EDPM_MODE); ctx->db_rec = !!(ureq.context_flags & QEDR_ALLOC_UCTX_DB_REC); } @@ -316,7 +317,8 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) uresp.dpm_flags = QEDR_DPM_TYPE_IWARP_LEGACY; else uresp.dpm_flags = QEDR_DPM_TYPE_ROCE_ENHANCED | - QEDR_DPM_TYPE_ROCE_LEGACY; + QEDR_DPM_TYPE_ROCE_LEGACY | + QEDR_DPM_TYPE_ROCE_EDPM_MODE; uresp.dpm_flags |= QEDR_DPM_SIZES_SET; uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE; @@ -1750,7 +1752,7 @@ static int qedr_create_user_qp(struct qedr_dev *dev, struct qed_rdma_create_qp_out_params out_params; struct qedr_pd *pd = get_qedr_pd(ibpd); struct qedr_create_qp_uresp uresp; - struct qedr_ucontext *ctx = NULL; + struct qedr_ucontext *ctx = pd ? pd->uctx : NULL; struct qedr_create_qp_ureq ureq; int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1); int rc = -EINVAL; @@ -1788,6 +1790,9 @@ static int qedr_create_user_qp(struct qedr_dev *dev, in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa; } + if (ctx) + SET_FIELD(in_params.flags, QED_ROCE_EDPM_MODE, ctx->edpm_mode); + qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx, &in_params, &out_params); diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index a0b83c9d4498..b261c9fca07b 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -39,7 +39,7 @@ /* user kernel communication data structures. */ enum qedr_alloc_ucontext_flags { - QEDR_ALLOC_UCTX_RESERVED = 1 << 0, + QEDR_ALLOC_UCTX_EDPM_MODE = 1 << 0, QEDR_ALLOC_UCTX_DB_REC = 1 << 1 }; @@ -56,7 +56,7 @@ enum qedr_rdma_dpm_type { QEDR_DPM_TYPE_ROCE_ENHANCED = 1 << 0, QEDR_DPM_TYPE_ROCE_LEGACY = 1 << 1, QEDR_DPM_TYPE_IWARP_LEGACY = 1 << 2, - QEDR_DPM_TYPE_RESERVED = 1 << 3, + QEDR_DPM_TYPE_ROCE_EDPM_MODE = 1 << 3, QEDR_DPM_SIZES_SET = 1 << 4, }; -- cgit v1.2.3 From eb7f84e379daad69b4c92538baeaf93bbf493c14 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 7 Jul 2020 09:31:00 +0300 Subject: RDMA/qedr: Add EDPM max size to alloc ucontext response User space should receive the maximum edpm size from kernel driver, similar to other edpm/ldpm related limits. Add an additional parameter to the alloc_ucontext_resp structure for the edpm maximum size. In addition, pass an indication from user-space to kernel (and not just kernel to user) that the DPM sizes are supported. This is for supporting backward-forward compatibility between driver and lib for everything related to DPM transaction and limit sizes. This should have been part of commit mentioned in Fixes tag. Link: https://lore.kernel.org/r/20200707063100.3811-3-michal.kalderon@marvell.com Fixes: 93a3d05f9d68 ("RDMA/qedr: Add kernel capability flags for dpm enabled mode") Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 9 ++++++--- include/uapi/rdma/qedr-abi.h | 6 +++++- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 5008149ea116..fcf2eaa3b459 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -320,9 +320,12 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) QEDR_DPM_TYPE_ROCE_LEGACY | QEDR_DPM_TYPE_ROCE_EDPM_MODE; - uresp.dpm_flags |= QEDR_DPM_SIZES_SET; - uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE; - uresp.edpm_trans_size = QEDR_EDPM_TRANS_SIZE; + if (ureq.context_flags & QEDR_SUPPORT_DPM_SIZES) { + uresp.dpm_flags |= QEDR_DPM_SIZES_SET; + uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE; + uresp.edpm_trans_size = QEDR_EDPM_TRANS_SIZE; + uresp.edpm_limit_size = QEDR_EDPM_MAX_SIZE; + } uresp.wids_enabled = 1; uresp.wid_count = oparams.wid_count; diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index b261c9fca07b..bf7333b2b5d7 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -40,7 +40,8 @@ /* user kernel communication data structures. */ enum qedr_alloc_ucontext_flags { QEDR_ALLOC_UCTX_EDPM_MODE = 1 << 0, - QEDR_ALLOC_UCTX_DB_REC = 1 << 1 + QEDR_ALLOC_UCTX_DB_REC = 1 << 1, + QEDR_SUPPORT_DPM_SIZES = 1 << 2, }; struct qedr_alloc_ucontext_req { @@ -50,6 +51,7 @@ struct qedr_alloc_ucontext_req { #define QEDR_LDPM_MAX_SIZE (8192) #define QEDR_EDPM_TRANS_SIZE (64) +#define QEDR_EDPM_MAX_SIZE (ROCE_REQ_MAX_INLINE_DATA_SIZE) enum qedr_rdma_dpm_type { QEDR_DPM_TYPE_NONE = 0, @@ -77,6 +79,8 @@ struct qedr_alloc_ucontext_resp { __u16 ldpm_limit_size; __u8 edpm_trans_size; __u8 reserved; + __u16 edpm_limit_size; + __u8 padding[6]; }; struct qedr_alloc_pd_ureq { -- cgit v1.2.3 From e3a5a1e8b6548f5d37328e2d3571edc5c9e6d7c0 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Thu, 16 Jul 2020 12:12:35 -0700 Subject: tcp: add SNMP counter for no. of duplicate segments reported by DSACK There are two existing SNMP counters, TCPDSACKRecv and TCPDSACKOfoRecv, which are incremented depending on whether the DSACKed range is below the cumulative ACK sequence number or not. Unfortunately, these both implicitly assume each DSACK covers only one segment. This makes these counters unusable for estimating spurious retransmit rates, or real/non-spurious loss rate. This patch introduces a new SNMP counter, TCPDSACKRecvSegs, which tracks the estimated number of duplicate segments based on: (DSACKed sequence range) / MSS. This counter is usable for estimating spurious retransmit rates, or real/non-spurious loss rate. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 1 + 3 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 7d91f4debc48..cee9f8e6fce3 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -287,6 +287,7 @@ enum LINUX_MIB_TCPFASTOPENPASSIVEALTKEY, /* TCPFastOpenPassiveAltKey */ LINUX_MIB_TCPTIMEOUTREHASH, /* TCPTimeoutRehash */ LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ + LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 75545a829a2b..1074df726ec0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -292,6 +292,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), + SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d6bbcb1e570..82906deb7874 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1153,6 +1153,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); /* D-SACK for already forgotten data... Do dumb counting. */ if (tp->undo_marker && tp->undo_retrans > 0 && -- cgit v1.2.3 From e9ddbb7707ff5891616240026062b8c1e29864ca Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:23 +0200 Subject: bpf: Introduce SK_LOOKUP program type with a dedicated attach point Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection oriented protocols, or when looking up an unconnected socket for a packet for connection-less protocols. When called, SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what bind() API allows to express. Two use-cases driving this work are: (1) steer packets destined to an IP range, on fixed port to a socket 192.0.2.0/24, port 80 -> NGINX socket (2) steer packets destined to an IP address, on any port to a socket 198.51.100.1, any port -> L7 proxy socket In its run-time context program receives information about the packet that triggered the socket lookup. Namely IP version, L4 protocol identifier, and address 4-tuple. Context can be further extended to include ingress interface identifier. To select a socket BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...) helper to record the selection. Transport layer then uses the selected socket as a result of socket lookup. In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail. This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on local delivery path in ipv4 and ipv6 stacks. Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 3 + include/linux/bpf.h | 1 + include/linux/bpf_types.h | 2 + include/linux/filter.h | 17 +++++ include/uapi/linux/bpf.h | 77 +++++++++++++++++++ kernel/bpf/net_namespace.c | 5 ++ kernel/bpf/syscall.c | 9 +++ kernel/bpf/verifier.c | 13 +++- net/core/filter.c | 180 +++++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 9 ++- 10 files changed, 312 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 47d5b0c708c9..722f799c1a2e 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -8,6 +8,7 @@ enum netns_bpf_attach_type { NETNS_BPF_INVALID = -1, NETNS_BPF_FLOW_DISSECTOR = 0, + NETNS_BPF_SK_LOOKUP, MAX_NETNS_BPF_ATTACH_TYPE }; @@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type) switch (attach_type) { case BPF_FLOW_DISSECTOR: return NETNS_BPF_FLOW_DISSECTOR; + case BPF_SK_LOOKUP: + return NETNS_BPF_SK_LOOKUP; default: return NETNS_BPF_INVALID; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c8c9eabcd106..adb16bdc5f0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -249,6 +249,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a18ae82a298a..a52a5688418e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, struct sk_reuseport_md, struct sk_reuseport_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup, + struct bpf_sk_lookup, struct bpf_sk_lookup_kern) #endif #if defined(CONFIG_BPF_JIT) BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, diff --git a/include/linux/filter.h b/include/linux/filter.h index 0b0144752d78..fa1ea12ad2cd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern { s32 retval; }; +struct bpf_sk_lookup_kern { + u16 family; + u16 protocol; + struct { + __be32 saddr; + __be32 daddr; + } v4; + struct { + const struct in6_addr *saddr; + const struct in6_addr *daddr; + } v6; + __be16 sport; + u16 dport; + struct sock *selected_sk; + bool no_reuseport; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7ac3992dacfe..54d0c886e3ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -189,6 +189,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, }; enum bpf_attach_type { @@ -228,6 +229,7 @@ enum bpf_attach_type { BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, __MAX_BPF_ATTACH_TYPE }; @@ -3069,6 +3071,10 @@ union bpf_attr { * * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, * will cause *skb* to be delivered to the specified socket. @@ -3094,6 +3100,56 @@ union bpf_attr { * **-ESOCKTNOSUPPORT** if the socket type is not supported * (reuseport). * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. @@ -3607,6 +3663,12 @@ enum { BPF_RINGBUF_HDR_SZ = 8, }; +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -4349,4 +4411,19 @@ struct bpf_pidns_info { __u32 pid; __u32 tgid; }; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ +struct bpf_sk_lookup { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __u32 remote_port; /* Network byte order */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index e9c8e26ac8f2..38b368bccda2 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; + case NETNS_BPF_SK_LOOKUP: + return 64; default: return 0; } @@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; + case NETNS_BPF_SK_LOOKUP: + err = 0; /* nothing to check */ + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ea9dfbebd8c..d07417d17712 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_SK_LOOKUP: + if (expected_attach_type == BPF_SK_LOOKUP) + return 0; + return -EINVAL; case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!capable(CAP_NET_ADMIN)) @@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: return BPF_PROG_TYPE_TRACING; + case BPF_SK_LOOKUP: + return BPF_PROG_TYPE_SK_LOOKUP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: + case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; @@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr) ret = tracing_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c1efc9d08fd..9a6703bc3f36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } meta->ref_obj_id = reg->ref_obj_id; } - } else if (arg_type == ARG_PTR_TO_SOCKET) { + } else if (arg_type == ARG_PTR_TO_SOCKET || + arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { + if (type != expected_type) + goto err_type; + } } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) @@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env) return -ENOTSUPP; } break; + case BPF_PROG_TYPE_SK_LOOKUP: + range = tnum_range(SK_DROP, SK_PASS); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. diff --git a/net/core/filter.c b/net/core/filter.c index bdd2382e655d..d099436b3ff5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9229,6 +9229,186 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) + return -ESOCKTNOSUPPORT; /* reject connected sockets */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6843376733df..5bfa448b4704 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -404,6 +404,7 @@ class PrinterHelpers(Printer): type_fwds = [ 'struct bpf_fib_lookup', + 'struct bpf_sk_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', @@ -450,6 +451,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', @@ -487,6 +489,11 @@ class PrinterHelpers(Printer): 'struct sk_msg_buff': 'struct sk_msg_md', 'struct xdp_buff': 'struct xdp_md', } + # Helpers overloaded for different context types. + overloaded_helpers = [ + 'bpf_get_socket_cookie', + 'bpf_sk_assign', + ] def print_header(self): header = '''\ @@ -543,7 +550,7 @@ class PrinterHelpers(Printer): for i, a in enumerate(proto['args']): t = a['type'] n = a['name'] - if proto['name'] == 'bpf_get_socket_cookie' and i == 0: + if proto['name'] in self.overloaded_helpers and i == 0: t = 'void' n = 'ctx' one_arg = '{}{}'.format(comma, self.map_type(t)) -- cgit v1.2.3 From b3ab1c6058fad8cd5726f24e9ed9053e43bb2af4 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 24 Jun 2020 21:28:00 +0200 Subject: media: Add V4L2_TYPE_IS_CAPTURE helper It's all too easy to get confused by the V4L2_TYPE_IS_OUTPUT macro, when it's used as !V4L2_TYPE_IS_OUTPUT. Reduce the risk of confusion with macro to explicitly check for the CAPTURE queue type case. This change does not affect functionality, and it's only intended to make the code more readable. Suggested-by: Nicolas Dufresne Signed-off-by: Ezequiel Garcia Signed-off-by: Hans Verkuil [hverkuil-cisco@xs4all.nl: checkpatch: align with parenthesis] Signed-off-by: Mauro Carvalho Chehab --- drivers/media/common/videobuf2/videobuf2-v4l2.c | 4 ++-- drivers/media/platform/exynos-gsc/gsc-core.c | 2 +- drivers/media/platform/exynos-gsc/gsc-m2m.c | 2 +- drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c | 2 +- drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c | 7 +++---- drivers/media/platform/rcar_jpu.c | 2 +- drivers/media/platform/sti/hva/hva-v4l2.c | 2 +- drivers/media/platform/ti-vpe/vpe.c | 2 +- drivers/media/test-drivers/vicodec/vicodec-core.c | 6 +++--- drivers/media/v4l2-core/v4l2-mem2mem.c | 6 +++--- drivers/staging/media/hantro/hantro_v4l2.c | 2 +- drivers/staging/media/rkvdec/rkvdec.c | 2 +- include/uapi/linux/videodev2.h | 2 ++ 13 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index 57aa183bd198..30caad27281e 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -97,7 +97,7 @@ static int __verify_length(struct vb2_buffer *vb, const struct v4l2_buffer *b) unsigned int bytesused; unsigned int plane; - if (!V4L2_TYPE_IS_OUTPUT(b->type)) + if (V4L2_TYPE_IS_CAPTURE(b->type)) return 0; if (V4L2_TYPE_IS_MULTIPLANAR(b->type)) { @@ -311,7 +311,7 @@ static int vb2_fill_vb2_v4l2_buffer(struct vb2_buffer *vb, struct v4l2_buffer *b /* Zero flags that we handle */ vbuf->flags = b->flags & ~V4L2_BUFFER_MASK_FLAGS; - if (!vb->vb2_queue->copy_timestamp || !V4L2_TYPE_IS_OUTPUT(b->type)) { + if (!vb->vb2_queue->copy_timestamp || V4L2_TYPE_IS_CAPTURE(b->type)) { /* * Non-COPY timestamps and non-OUTPUT queues will get * their timestamp and timestamp source flags from the diff --git a/drivers/media/platform/exynos-gsc/gsc-core.c b/drivers/media/platform/exynos-gsc/gsc-core.c index f6650b45bc3d..9f41c2e7097a 100644 --- a/drivers/media/platform/exynos-gsc/gsc-core.c +++ b/drivers/media/platform/exynos-gsc/gsc-core.c @@ -577,7 +577,7 @@ int gsc_try_selection(struct gsc_ctx *ctx, struct v4l2_selection *s) v4l_bound_align_image(&tmp_w, min_w, max_w, mod_x, &tmp_h, min_h, max_h, mod_y, 0); - if (!V4L2_TYPE_IS_OUTPUT(s->type) && + if (V4L2_TYPE_IS_CAPTURE(s->type) && (ctx->gsc_ctrls.rotate->val == 90 || ctx->gsc_ctrls.rotate->val == 270)) gsc_check_crop_change(tmp_h, tmp_w, diff --git a/drivers/media/platform/exynos-gsc/gsc-m2m.c b/drivers/media/platform/exynos-gsc/gsc-m2m.c index e2c162635f72..27a3c92c73bc 100644 --- a/drivers/media/platform/exynos-gsc/gsc-m2m.c +++ b/drivers/media/platform/exynos-gsc/gsc-m2m.c @@ -255,7 +255,7 @@ static int gsc_m2m_buf_prepare(struct vb2_buffer *vb) if (IS_ERR(frame)) return PTR_ERR(frame); - if (!V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) { + if (V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type)) { for (i = 0; i < frame->fmt->num_planes; i++) vb2_set_plane_payload(vb, i, frame->payload[i]); } diff --git a/drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c b/drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c index f82a81a3bdee..61fed1e35a00 100644 --- a/drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c +++ b/drivers/media/platform/mtk-jpeg/mtk_jpeg_core.c @@ -731,7 +731,7 @@ static void mtk_jpeg_stop_streaming(struct vb2_queue *q) * subsampling. Update capture queue when the stream is off. */ if (ctx->state == MTK_JPEG_SOURCE_CHANGE && - !V4L2_TYPE_IS_OUTPUT(q->type)) { + V4L2_TYPE_IS_CAPTURE(q->type)) { struct mtk_jpeg_src_buf *src_buf; vb = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx); diff --git a/drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c b/drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c index bb9caaf513bc..724c7333b6e5 100644 --- a/drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c +++ b/drivers/media/platform/mtk-mdp/mtk_mdp_m2m.c @@ -193,7 +193,7 @@ static const struct mtk_mdp_fmt *mtk_mdp_try_fmt_mplane(struct mtk_mdp_ctx *ctx, pix_mp->field = V4L2_FIELD_NONE; pix_mp->pixelformat = fmt->pixelformat; - if (!V4L2_TYPE_IS_OUTPUT(f->type)) { + if (V4L2_TYPE_IS_CAPTURE(f->type)) { pix_mp->colorspace = ctx->colorspace; pix_mp->xfer_func = ctx->xfer_func; pix_mp->ycbcr_enc = ctx->ycbcr_enc; @@ -327,9 +327,8 @@ static int mtk_mdp_try_crop(struct mtk_mdp_ctx *ctx, u32 type, mtk_mdp_bound_align_image(&new_w, min_w, max_w, align_w, &new_h, min_h, max_h, align_h); - if (!V4L2_TYPE_IS_OUTPUT(type) && - (ctx->ctrls.rotate->val == 90 || - ctx->ctrls.rotate->val == 270)) + if (V4L2_TYPE_IS_CAPTURE(type) && + (ctx->ctrls.rotate->val == 90 || ctx->ctrls.rotate->val == 270)) mtk_mdp_check_crop_change(new_h, new_w, &r->width, &r->height); else diff --git a/drivers/media/platform/rcar_jpu.c b/drivers/media/platform/rcar_jpu.c index 5250a14324e9..9b99ff368698 100644 --- a/drivers/media/platform/rcar_jpu.c +++ b/drivers/media/platform/rcar_jpu.c @@ -1066,7 +1066,7 @@ static int jpu_buf_prepare(struct vb2_buffer *vb) } /* decoder capture queue */ - if (!ctx->encoder && !V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) + if (!ctx->encoder && V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type)) vb2_set_plane_payload(vb, i, size); } diff --git a/drivers/media/platform/sti/hva/hva-v4l2.c b/drivers/media/platform/sti/hva/hva-v4l2.c index 197b99d8fd9c..bb34d6997d99 100644 --- a/drivers/media/platform/sti/hva/hva-v4l2.c +++ b/drivers/media/platform/sti/hva/hva-v4l2.c @@ -1087,7 +1087,7 @@ static void hva_stop_streaming(struct vb2_queue *vq) if ((V4L2_TYPE_IS_OUTPUT(vq->type) && vb2_is_streaming(&ctx->fh.m2m_ctx->cap_q_ctx.q)) || - (!V4L2_TYPE_IS_OUTPUT(vq->type) && + (V4L2_TYPE_IS_CAPTURE(vq->type) && vb2_is_streaming(&ctx->fh.m2m_ctx->out_q_ctx.q))) { dev_dbg(dev, "%s %s out=%d cap=%d\n", ctx->name, to_type_str(vq->type), diff --git a/drivers/media/platform/ti-vpe/vpe.c b/drivers/media/platform/ti-vpe/vpe.c index cff2fcd6d812..346f8212791c 100644 --- a/drivers/media/platform/ti-vpe/vpe.c +++ b/drivers/media/platform/ti-vpe/vpe.c @@ -1576,7 +1576,7 @@ static int vpe_g_fmt(struct file *file, void *priv, struct v4l2_format *f) *f = q_data->format; - if (!V4L2_TYPE_IS_OUTPUT(f->type)) { + if (V4L2_TYPE_IS_CAPTURE(f->type)) { struct vpe_q_data *s_q_data; struct v4l2_pix_format_mplane *spix; diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c index e879290727ef..8941d73f6611 100644 --- a/drivers/media/test-drivers/vicodec/vicodec-core.c +++ b/drivers/media/test-drivers/vicodec/vicodec-core.c @@ -1442,7 +1442,7 @@ static void vicodec_buf_queue(struct vb2_buffer *vb) .u.src_change.changes = V4L2_EVENT_SRC_CH_RESOLUTION, }; - if (!V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type) && + if (V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type) && vb2_is_streaming(vb->vb2_queue) && v4l2_m2m_dst_buf_is_last(ctx->fh.m2m_ctx)) { unsigned int i; @@ -1479,7 +1479,7 @@ static void vicodec_buf_queue(struct vb2_buffer *vb) * in the compressed stream */ if (ctx->is_stateless || ctx->is_enc || - !V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) { + V4L2_TYPE_IS_CAPTURE(vb->vb2_queue->type)) { v4l2_m2m_buf_queue(ctx->fh.m2m_ctx, vbuf); return; } @@ -1574,7 +1574,7 @@ static int vicodec_start_streaming(struct vb2_queue *q, state->gop_cnt = 0; if ((V4L2_TYPE_IS_OUTPUT(q->type) && !ctx->is_enc) || - (!V4L2_TYPE_IS_OUTPUT(q->type) && ctx->is_enc)) + (V4L2_TYPE_IS_CAPTURE(q->type) && ctx->is_enc)) return 0; if (info->id == V4L2_PIX_FMT_FWHT || diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c index 62ac9424c92a..95a8f2dc5341 100644 --- a/drivers/media/v4l2-core/v4l2-mem2mem.c +++ b/drivers/media/v4l2-core/v4l2-mem2mem.c @@ -556,7 +556,7 @@ int v4l2_m2m_querybuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, ret = vb2_querybuf(vq, buf); /* Adjust MMAP memory offsets for the CAPTURE queue */ - if (buf->memory == V4L2_MEMORY_MMAP && !V4L2_TYPE_IS_OUTPUT(vq->type)) { + if (buf->memory == V4L2_MEMORY_MMAP && V4L2_TYPE_IS_CAPTURE(vq->type)) { if (V4L2_TYPE_IS_MULTIPLANAR(vq->type)) { for (i = 0; i < buf->length; ++i) buf->m.planes[i].m.mem_offset @@ -712,7 +712,7 @@ int v4l2_m2m_qbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, int ret; vq = v4l2_m2m_get_vq(m2m_ctx, buf->type); - if (!V4L2_TYPE_IS_OUTPUT(vq->type) && + if (V4L2_TYPE_IS_CAPTURE(vq->type) && (buf->flags & V4L2_BUF_FLAG_REQUEST_FD)) { dprintk("%s: requests cannot be used with capture buffers\n", __func__); @@ -729,7 +729,7 @@ int v4l2_m2m_qbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, * buffer as DONE with LAST flag since it won't be queued on the * device. */ - if (!V4L2_TYPE_IS_OUTPUT(vq->type) && + if (V4L2_TYPE_IS_CAPTURE(vq->type) && vb2_is_streaming(vq) && !vb2_start_streaming_called(vq) && (v4l2_m2m_has_stopped(m2m_ctx) || v4l2_m2m_dst_buf_is_last(m2m_ctx))) v4l2_m2m_force_last_buf_done(m2m_ctx, vq); diff --git a/drivers/staging/media/hantro/hantro_v4l2.c b/drivers/staging/media/hantro/hantro_v4l2.c index f28a94e2fa93..63859e8a0923 100644 --- a/drivers/staging/media/hantro/hantro_v4l2.c +++ b/drivers/staging/media/hantro/hantro_v4l2.c @@ -237,7 +237,7 @@ static int hantro_try_fmt(const struct hantro_ctx *ctx, enum v4l2_buf_type type) { const struct hantro_fmt *fmt, *vpu_fmt; - bool capture = !V4L2_TYPE_IS_OUTPUT(type); + bool capture = V4L2_TYPE_IS_CAPTURE(type); bool coded; coded = capture == hantro_is_encoder_ctx(ctx); diff --git a/drivers/staging/media/rkvdec/rkvdec.c b/drivers/staging/media/rkvdec/rkvdec.c index 225eeca73356..fd68671f0286 100644 --- a/drivers/staging/media/rkvdec/rkvdec.c +++ b/drivers/staging/media/rkvdec/rkvdec.c @@ -489,7 +489,7 @@ static int rkvdec_start_streaming(struct vb2_queue *q, unsigned int count) const struct rkvdec_coded_fmt_desc *desc; int ret; - if (!V4L2_TYPE_IS_OUTPUT(q->type)) + if (V4L2_TYPE_IS_CAPTURE(q->type)) return 0; desc = ctx->coded_fmt_desc; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 303805438814..c7b70ff53bc1 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -171,6 +171,8 @@ enum v4l2_buf_type { || (type) == V4L2_BUF_TYPE_SDR_OUTPUT \ || (type) == V4L2_BUF_TYPE_META_OUTPUT) +#define V4L2_TYPE_IS_CAPTURE(type) (!V4L2_TYPE_IS_OUTPUT(type)) + enum v4l2_tuner_type { V4L2_TUNER_RADIO = 1, V4L2_TUNER_ANALOG_TV = 2, -- cgit v1.2.3 From 124ea650d3072b005457faed69909221c2905a1f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 19 Jul 2020 12:04:11 +0200 Subject: capabilities: Introduce CAP_CHECKPOINT_RESTORE This patch introduces CAP_CHECKPOINT_RESTORE, a new capability facilitating checkpoint/restore for non-root users. Over the last years, The CRIU (Checkpoint/Restore In Userspace) team has been asked numerous times if it is possible to checkpoint/restore a process as non-root. The answer usually was: 'almost'. The main blocker to restore a process as non-root was to control the PID of the restored process. This feature available via the clone3 system call, or via /proc/sys/kernel/ns_last_pid is unfortunately guarded by CAP_SYS_ADMIN. In the past two years, requests for non-root checkpoint/restore have increased due to the following use cases: * Checkpoint/Restore in an HPC environment in combination with a resource manager distributing jobs where users are always running as non-root. There is a desire to provide a way to checkpoint and restore long running jobs. * Container migration as non-root * We have been in contact with JVM developers who are integrating CRIU into a Java VM to decrease the startup time. These checkpoint/restore applications are not meant to be running with CAP_SYS_ADMIN. We have seen the following workarounds: * Use a setuid wrapper around CRIU: See https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c * Use a setuid helper that writes to ns_last_pid. Unfortunately, this helper delegation technique is impossible to use with clone3, and is thus prone to races. See https://github.com/twosigma/set_ns_last_pid * Cycle through PIDs with fork() until the desired PID is reached: This has been demonstrated to work with cycling rates of 100,000 PIDs/s See https://github.com/twosigma/set_ns_last_pid * Patch out the CAP_SYS_ADMIN check from the kernel * Run the desired application in a new user and PID namespace to provide a local CAP_SYS_ADMIN for controlling PIDs. This technique has limited use in typical container environments (e.g., Kubernetes) as /proc is typically protected with read-only layers (e.g., /proc/sys) for hardening purposes. Read-only layers prevent additional /proc mounts (due to proc's SB_I_USERNS_VISIBLE property), making the use of new PID namespaces limited as certain applications need access to /proc matching their PID namespace. The introduced capability allows to: * Control PIDs when the current user is CAP_CHECKPOINT_RESTORE capable for the corresponding PID namespace via ns_last_pid/clone3. * Open files in /proc/pid/map_files when the current user is CAP_CHECKPOINT_RESTORE capable in the root namespace, useful for recovering files that are unreachable via the file system such as deleted files, or memfd files. See corresponding selftest for an example with clone3(). Signed-off-by: Adrian Reber Signed-off-by: Nicolas Viennot Reviewed-by: Serge Hallyn Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200719100418.2112740-2-areber@redhat.com Signed-off-by: Christian Brauner --- include/linux/capability.h | 6 ++++++ include/uapi/linux/capability.h | 9 ++++++++- security/selinux/include/classmap.h | 5 +++-- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/capability.h b/include/linux/capability.h index b4345b38a6be..1e7fe311cabe 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -261,6 +261,12 @@ static inline bool bpf_capable(void) return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } +static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) +{ + return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || + ns_capable(ns, CAP_SYS_ADMIN); +} + /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 48ff0757ae5e..395dd0df8d08 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -408,7 +408,14 @@ struct vfs_ns_cap_data { */ #define CAP_BPF 39 -#define CAP_LAST_CAP CAP_BPF + +/* Allow checkpoint/restore related operations */ +/* Allow PID selection during clone3() */ +/* Allow writing to ns_last_pid */ + +#define CAP_CHECKPOINT_RESTORE 40 + +#define CAP_LAST_CAP CAP_CHECKPOINT_RESTORE #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 98e1513b608a..40cebde62856 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,10 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf" + "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \ + "checkpoint_restore" -#if CAP_LAST_CAP > CAP_BPF +#if CAP_LAST_CAP > CAP_CHECKPOINT_RESTORE #error New capability defined, please update COMMON_CAP2_PERMS. #endif -- cgit v1.2.3 From c4471ad9a50d5548e66ae4511acfb1dc23a48744 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 20 Jul 2020 00:03:33 +0200 Subject: net: phy: add USXGMII link partner ability constants The constants are taken from the USXGMII Singleport Copper Interface specification. The naming are based on the SGMII ones, but with an MDIO_ prefix. Signed-off-by: Michael Walle Reviewed-by: Russell King Signed-off-by: David S. Miller --- include/uapi/linux/mdio.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 4bcb41c71b8c..3f302e2523b2 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -324,4 +324,30 @@ static inline __u16 mdio_phy_id_c45(int prtad, int devad) return MDIO_PHY_ID_C45 | (prtad << 5) | devad; } +/* UsxgmiiChannelInfo[15:0] for USXGMII in-band auto-negotiation.*/ +#define MDIO_USXGMII_EEE_CLK_STP 0x0080 /* EEE clock stop supported */ +#define MDIO_USXGMII_EEE 0x0100 /* EEE supported */ +#define MDIO_USXGMII_SPD_MASK 0x0e00 /* USXGMII speed mask */ +#define MDIO_USXGMII_FULL_DUPLEX 0x1000 /* USXGMII full duplex */ +#define MDIO_USXGMII_DPX_SPD_MASK 0x1e00 /* USXGMII duplex and speed bits */ +#define MDIO_USXGMII_10 0x0000 /* 10Mbps */ +#define MDIO_USXGMII_10HALF 0x0000 /* 10Mbps half-duplex */ +#define MDIO_USXGMII_10FULL 0x1000 /* 10Mbps full-duplex */ +#define MDIO_USXGMII_100 0x0200 /* 100Mbps */ +#define MDIO_USXGMII_100HALF 0x0200 /* 100Mbps half-duplex */ +#define MDIO_USXGMII_100FULL 0x1200 /* 100Mbps full-duplex */ +#define MDIO_USXGMII_1000 0x0400 /* 1000Mbps */ +#define MDIO_USXGMII_1000HALF 0x0400 /* 1000Mbps half-duplex */ +#define MDIO_USXGMII_1000FULL 0x1400 /* 1000Mbps full-duplex */ +#define MDIO_USXGMII_10G 0x0600 /* 10Gbps */ +#define MDIO_USXGMII_10GHALF 0x0600 /* 10Gbps half-duplex */ +#define MDIO_USXGMII_10GFULL 0x1600 /* 10Gbps full-duplex */ +#define MDIO_USXGMII_2500 0x0800 /* 2500Mbps */ +#define MDIO_USXGMII_2500HALF 0x0800 /* 2500Mbps half-duplex */ +#define MDIO_USXGMII_2500FULL 0x1800 /* 2500Mbps full-duplex */ +#define MDIO_USXGMII_5000 0x0a00 /* 5000Mbps */ +#define MDIO_USXGMII_5000HALF 0x0a00 /* 5000Mbps half-duplex */ +#define MDIO_USXGMII_5000FULL 0x1a00 /* 5000Mbps full-duplex */ +#define MDIO_USXGMII_LINK 0x8000 /* PHY link with copper-side partner */ + #endif /* _UAPI__LINUX_MDIO_H__ */ -- cgit v1.2.3 From 55db9c0e853421fa71cac5e6855898601f78a1f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jul 2020 08:23:15 +0200 Subject: net: remove compat_sys_{get,set}sockopt Now that the ->compat_{get,set}sockopt proto_ops methods are gone there is no good reason left to keep the compat syscalls separate. This fixes the odd use of unsigned int for the compat_setsockopt optlen and the missing sock_use_custom_sol_socket. It would also easily allow running the eBPF hooks for the compat syscalls, but such a large change in behavior does not belong into a consolidation patch like this one. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- arch/arm64/include/asm/unistd32.h | 4 +- arch/mips/kernel/syscalls/syscall_n32.tbl | 4 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 4 +- arch/parisc/kernel/syscalls/syscall.tbl | 4 +- arch/powerpc/kernel/syscalls/syscall.tbl | 4 +- arch/s390/kernel/syscalls/syscall.tbl | 4 +- arch/sparc/kernel/sys32.S | 12 ++-- arch/sparc/kernel/syscalls/syscall.tbl | 4 +- arch/x86/entry/syscall_x32.c | 7 ++ arch/x86/entry/syscalls/syscall_32.tbl | 4 +- arch/x86/entry/syscalls/syscall_64.tbl | 4 +- include/linux/compat.h | 4 -- include/linux/syscalls.h | 4 ++ include/uapi/asm-generic/unistd.h | 4 +- net/compat.c | 79 +--------------------- net/socket.c | 25 ++++--- tools/include/uapi/asm-generic/unistd.h | 4 +- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 4 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 4 +- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 4 +- 20 files changed, 62 insertions(+), 125 deletions(-) (limited to 'include/uapi') diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 6d95d0c8bf2f..166e36903110 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -599,9 +599,9 @@ __SYSCALL(__NR_recvfrom, compat_sys_recvfrom) #define __NR_shutdown 293 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_setsockopt 294 -__SYSCALL(__NR_setsockopt, compat_sys_setsockopt) +__SYSCALL(__NR_setsockopt, sys_setsockopt) #define __NR_getsockopt 295 -__SYSCALL(__NR_getsockopt, compat_sys_getsockopt) +__SYSCALL(__NR_getsockopt, sys_getsockopt) #define __NR_sendmsg 296 __SYSCALL(__NR_sendmsg, compat_sys_sendmsg) #define __NR_recvmsg 297 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index f777141f5256..8488b0d0a99e 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -60,8 +60,8 @@ 50 n32 getsockname sys_getsockname 51 n32 getpeername sys_getpeername 52 n32 socketpair sys_socketpair -53 n32 setsockopt compat_sys_setsockopt -54 n32 getsockopt compat_sys_getsockopt +53 n32 setsockopt sys_setsockopt +54 n32 getsockopt sys_getsockopt 55 n32 clone __sys_clone 56 n32 fork __sys_fork 57 n32 execve compat_sys_execve diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 13280625d312..b20522f813f9 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -184,7 +184,7 @@ 170 o32 connect sys_connect 171 o32 getpeername sys_getpeername 172 o32 getsockname sys_getsockname -173 o32 getsockopt sys_getsockopt compat_sys_getsockopt +173 o32 getsockopt sys_getsockopt sys_getsockopt 174 o32 listen sys_listen 175 o32 recv sys_recv compat_sys_recv 176 o32 recvfrom sys_recvfrom compat_sys_recvfrom @@ -192,7 +192,7 @@ 178 o32 send sys_send 179 o32 sendmsg sys_sendmsg compat_sys_sendmsg 180 o32 sendto sys_sendto -181 o32 setsockopt sys_setsockopt compat_sys_setsockopt +181 o32 setsockopt sys_setsockopt sys_setsockopt 182 o32 shutdown sys_shutdown 183 o32 socket sys_socket 184 o32 socketpair sys_socketpair diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 5a758fa6ec52..3494e4fa1a17 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -198,8 +198,8 @@ 178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 180 common chown sys_chown -181 common setsockopt sys_setsockopt compat_sys_setsockopt -182 common getsockopt sys_getsockopt compat_sys_getsockopt +181 common setsockopt sys_setsockopt sys_setsockopt +182 common getsockopt sys_getsockopt sys_getsockopt 183 common sendmsg sys_sendmsg compat_sys_sendmsg 184 common recvmsg sys_recvmsg compat_sys_recvmsg 185 common semop sys_semop diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f833a3190822..94eb5b27ef65 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -433,8 +433,8 @@ 336 common recv sys_recv compat_sys_recv 337 common recvfrom sys_recvfrom compat_sys_recvfrom 338 common shutdown sys_shutdown -339 common setsockopt sys_setsockopt compat_sys_setsockopt -340 common getsockopt sys_getsockopt compat_sys_getsockopt +339 common setsockopt sys_setsockopt sys_setsockopt +340 common getsockopt sys_getsockopt sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg 343 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bfdcb7633957..0d63c71fc544 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -372,8 +372,8 @@ 362 common connect sys_connect sys_connect 363 common listen sys_listen sys_listen 364 common accept4 sys_accept4 sys_accept4 -365 common getsockopt sys_getsockopt compat_sys_getsockopt -366 common setsockopt sys_setsockopt compat_sys_setsockopt +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt 367 common getsockname sys_getsockname sys_getsockname 368 common getpeername sys_getpeername sys_getpeername 369 common sendto sys_sendto sys_sendto diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index 489ffab918a8..a45f0f31fe51 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -157,22 +157,22 @@ do_sys_shutdown: /* sys_shutdown(int, int) */ nop nop nop -do_sys_setsockopt: /* compat_sys_setsockopt(int, int, int, char *, int) */ +do_sys_setsockopt: /* sys_setsockopt(int, int, int, char *, int) */ 47: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(compat_sys_setsockopt), %g1 + sethi %hi(sys_setsockopt), %g1 48: ldswa [%o1 + 0x8] %asi, %o2 49: lduwa [%o1 + 0xc] %asi, %o3 50: ldswa [%o1 + 0x10] %asi, %o4 - jmpl %g1 + %lo(compat_sys_setsockopt), %g0 + jmpl %g1 + %lo(sys_setsockopt), %g0 51: ldswa [%o1 + 0x4] %asi, %o1 nop -do_sys_getsockopt: /* compat_sys_getsockopt(int, int, int, u32, u32) */ +do_sys_getsockopt: /* sys_getsockopt(int, int, int, u32, u32) */ 52: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(compat_sys_getsockopt), %g1 + sethi %hi(sys_getsockopt), %g1 53: ldswa [%o1 + 0x8] %asi, %o2 54: lduwa [%o1 + 0xc] %asi, %o3 55: lduwa [%o1 + 0x10] %asi, %o4 - jmpl %g1 + %lo(compat_sys_getsockopt), %g0 + jmpl %g1 + %lo(sys_getsockopt), %g0 56: ldswa [%o1 + 0x4] %asi, %o1 nop do_sys_sendmsg: /* compat_sys_sendmsg(int, struct compat_msghdr *, unsigned int) */ diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 8004a276cb74..c59b37965add 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -147,7 +147,7 @@ 115 32 getgroups32 sys_getgroups 116 common gettimeofday sys_gettimeofday compat_sys_gettimeofday 117 common getrusage sys_getrusage compat_sys_getrusage -118 common getsockopt sys_getsockopt compat_sys_getsockopt +118 common getsockopt sys_getsockopt sys_getsockopt 119 common getcwd sys_getcwd 120 common readv sys_readv compat_sys_readv 121 common writev sys_writev compat_sys_writev @@ -425,7 +425,7 @@ 352 common userfaultfd sys_userfaultfd 353 common bind sys_bind 354 common listen sys_listen -355 common setsockopt sys_setsockopt compat_sys_setsockopt +355 common setsockopt sys_setsockopt sys_setsockopt 356 common mlock2 sys_mlock2 357 common copy_file_range sys_copy_file_range 358 common preadv2 sys_preadv2 compat_sys_preadv2 diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 3d8d70d3896c..1583831f61a9 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,6 +8,13 @@ #include #include +/* + * Reuse the 64-bit entry points for the x32 versions that occupy different + * slots in the syscall table. + */ +#define __x32_sys_getsockopt __x64_sys_getsockopt +#define __x32_sys_setsockopt __x64_sys_setsockopt + #define __SYSCALL_64(nr, sym) #define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d8f8a1a69ed1..43742a69dba1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -376,8 +376,8 @@ 362 i386 connect sys_connect 363 i386 listen sys_listen 364 i386 accept4 sys_accept4 -365 i386 getsockopt sys_getsockopt compat_sys_getsockopt -366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +365 i386 getsockopt sys_getsockopt sys_getsockopt +366 i386 setsockopt sys_setsockopt sys_setsockopt 367 i386 getsockname sys_getsockname 368 i386 getpeername sys_getpeername 369 i386 sendto sys_sendto diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..e008d638e641 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -396,8 +396,8 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt +541 x32 setsockopt sys_setsockopt +542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat diff --git a/include/linux/compat.h b/include/linux/compat.h index e90100c0de72..c4255d8a4a8a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -737,10 +737,6 @@ asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg); asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, compat_size_t len, unsigned flags, struct sockaddr __user *addr, int __user *addrlen); -asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen); -asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen); asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags); asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b951a87da987..aa46825c6f9d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1424,4 +1424,8 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout); +int __sys_getsockopt(int fd, int level, int optname, char __user *optval, + int __user *optlen); +int __sys_setsockopt(int fd, int level, int optname, char __user *optval, + int optlen); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f4a01305d9a6..c8c189a5f0a6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -606,9 +606,9 @@ __SYSCALL(__NR_sendto, sys_sendto) #define __NR_recvfrom 207 __SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) #define __NR_setsockopt 208 -__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) +__SC_COMP(__NR_setsockopt, sys_setsockopt, sys_setsockopt) #define __NR_getsockopt 209 -__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) +__SC_COMP(__NR_getsockopt, sys_getsockopt, sys_getsockopt) #define __NR_shutdown 210 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_sendmsg 211 diff --git a/net/compat.c b/net/compat.c index 3e6c2c5ff260..091875bd6210 100644 --- a/net/compat.c +++ b/net/compat.c @@ -335,77 +335,6 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) __scm_destroy(scm); } -static int __compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - struct socket *sock; - - if (optlen > INT_MAX) - return -EINVAL; - - sock = sockfd_lookup(fd, &err); - if (sock) { - err = security_socket_setsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_setsockopt) - err = sock->ops->compat_setsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->setsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, - char __user *, optval, unsigned int, optlen) -{ - return __compat_sys_setsockopt(fd, level, optname, optval, optlen); -} - -static int __compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, - int __user *optlen) -{ - int err; - struct socket *sock = sockfd_lookup(fd, &err); - - if (sock) { - err = security_socket_getsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_getsockopt) - err = sock->ops->compat_getsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->getsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, - char __user *, optval, int __user *, optlen) -{ - return __compat_sys_getsockopt(fd, level, optname, optval, optlen); -} - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { @@ -565,13 +494,11 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: - ret = __compat_sys_setsockopt(a0, a1, a[2], - compat_ptr(a[3]), a[4]); + ret = __sys_setsockopt(a0, a1, a[2], compat_ptr(a[3]), a[4]); break; case SYS_GETSOCKOPT: - ret = __compat_sys_getsockopt(a0, a1, a[2], - compat_ptr(a[3]), - compat_ptr(a[4])); + ret = __sys_getsockopt(a0, a1, a[2], compat_ptr(a[3]), + compat_ptr(a[4])); break; case SYS_SENDMSG: ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); diff --git a/net/socket.c b/net/socket.c index b79376b17b45..dec345982abb 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2094,9 +2094,8 @@ static bool sock_use_custom_sol_socket(const struct socket *sock) * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. */ - -static int __sys_setsockopt(int fd, int level, int optname, - char __user *optval, int optlen) +int __sys_setsockopt(int fd, int level, int optname, char __user *optval, + int optlen) { mm_segment_t oldfs = get_fs(); char *kernel_optval = NULL; @@ -2114,8 +2113,10 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; - err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, - optval, &optlen, &kernel_optval); + if (!in_compat_syscall()) + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, + optval, &optlen, + &kernel_optval); if (err < 0) goto out_put; if (err > 0) { @@ -2154,9 +2155,8 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ - -static int __sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen) +int __sys_getsockopt(int fd, int level, int optname, char __user *optval, + int __user *optlen) { int err, fput_needed; struct socket *sock; @@ -2170,7 +2170,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; - max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (!in_compat_syscall()) + max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, optlen); @@ -2178,8 +2179,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); - err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, optval, - optlen, max_optlen, err); + if (!in_compat_syscall()) + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, max_optlen, + err); out_put: fput_light(sock->file, fput_needed); return err; diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index f4a01305d9a6..c8c189a5f0a6 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -606,9 +606,9 @@ __SYSCALL(__NR_sendto, sys_sendto) #define __NR_recvfrom 207 __SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) #define __NR_setsockopt 208 -__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) +__SC_COMP(__NR_setsockopt, sys_setsockopt, sys_setsockopt) #define __NR_getsockopt 209 -__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) +__SC_COMP(__NR_getsockopt, sys_getsockopt, sys_getsockopt) #define __NR_shutdown 210 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_sendmsg 211 diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 35b61bfc1b1a..b190f2eb2611 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -427,8 +427,8 @@ 336 common recv sys_recv compat_sys_recv 337 common recvfrom sys_recvfrom compat_sys_recvfrom 338 common shutdown sys_shutdown -339 common setsockopt sys_setsockopt compat_sys_setsockopt -340 common getsockopt sys_getsockopt compat_sys_getsockopt +339 common setsockopt sys_setsockopt sys_setsockopt +340 common getsockopt sys_getsockopt sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg 343 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index b38d48464368..56ae24b6e4be 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -372,8 +372,8 @@ 362 common connect sys_connect compat_sys_connect 363 common listen sys_listen sys_listen 364 common accept4 sys_accept4 compat_sys_accept4 -365 common getsockopt sys_getsockopt compat_sys_getsockopt -366 common setsockopt sys_setsockopt compat_sys_setsockopt +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt 367 common getsockname sys_getsockname compat_sys_getsockname 368 common getpeername sys_getpeername compat_sys_getpeername 369 common sendto sys_sendto compat_sys_sendto diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..e008d638e641 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -396,8 +396,8 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt +541 x32 setsockopt sys_setsockopt +542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat -- cgit v1.2.3 From eba75c587e811d3249c8bd50d22bb2266ccd3c0f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 10 Jul 2020 09:29:02 -0400 Subject: icmp: support rfc 4884 Add setsockopt SOL_IP/IP_RECVERR_4884 to return the offset to an extension struct if present. ICMP messages may include an extension structure after the original datagram. RFC 4884 standardized this behavior. It stores the offset in words to the extension header in u8 icmphdr.un.reserved[1]. The field is valid only for ICMP types destination unreachable, time exceeded and parameter problem, if length is at least 128 bytes and entire packet does not exceed 576 bytes. Return the offset to the start of the extension struct when reading an ICMP error from the error queue, if it matches the above constraints. Do not return the raw u8 field. Return the offset from the start of the user buffer, in bytes. The kernel does not return the network and transport headers, so subtract those. Also validate the headers. Return the offset regardless of validation, as an invalid extension must still not be misinterpreted as part of the original datagram. Note that !invalid does not imply valid. If the extension version does not match, no validation can take place, for instance. For backward compatibility, make this optional, set by setsockopt SOL_IP/IP_RECVERR_RFC4884. For API example and feature test, see github.com/wdebruij/kerneltools/blob/master/tests/recv_icmp_v2.c For forward compatibility, reserve only setsockopt value 1, leaving other bits for additional icmp extensions. Changes v1->v2: - convert word offset to byte offset from start of user buffer - return in ee_data as u8 may be insufficient - define extension struct and object header structs - return len only if constraints met - if returning len, also validate Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/icmp.h | 4 +++ include/net/inet_sock.h | 1 + include/uapi/linux/errqueue.h | 14 ++++++++- include/uapi/linux/icmp.h | 22 ++++++++++++++ include/uapi/linux/in.h | 1 + net/ipv4/icmp.c | 71 +++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_sockglue.c | 12 ++++++++ 7 files changed, 124 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/icmp.h b/include/linux/icmp.h index 81ca84ce3119..8fc38a34cb20 100644 --- a/include/linux/icmp.h +++ b/include/linux/icmp.h @@ -15,6 +15,7 @@ #include #include +#include static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { @@ -35,4 +36,7 @@ static inline bool icmp_is_err(int type) return false; } +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out); + #endif /* _LINUX_ICMP_H */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a7ce00af6c44..a3702d1d4875 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -225,6 +225,7 @@ struct inet_sock { mc_all:1, nodefrag:1; __u8 bind_address_no_port:1, + recverr_rfc4884:1, defer_connect:1; /* Indicates that fastopen_connect is set * and cookie exists so we defer connect * until first data frame is written diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index ca5cb3e3c6df..3c70e8ac14b8 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -5,6 +5,13 @@ #include #include +/* RFC 4884: return offset to extension struct + validation */ +struct sock_ee_data_rfc4884 { + __u16 len; + __u8 flags; + __u8 reserved; +}; + struct sock_extended_err { __u32 ee_errno; __u8 ee_origin; @@ -12,7 +19,10 @@ struct sock_extended_err { __u8 ee_code; __u8 ee_pad; __u32 ee_info; - __u32 ee_data; + union { + __u32 ee_data; + struct sock_ee_data_rfc4884 ee_rfc4884; + }; }; #define SO_EE_ORIGIN_NONE 0 @@ -31,6 +41,8 @@ struct sock_extended_err { #define SO_EE_CODE_TXTIME_INVALID_PARAM 1 #define SO_EE_CODE_TXTIME_MISSED 2 +#define SO_EE_RFC4884_FLAG_INVALID 1 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index 5589eeb791ca..fb169a50895e 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -19,6 +19,7 @@ #define _UAPI_LINUX_ICMP_H #include +#include #define ICMP_ECHOREPLY 0 /* Echo Reply */ #define ICMP_DEST_UNREACH 3 /* Destination Unreachable */ @@ -95,5 +96,26 @@ struct icmp_filter { __u32 data; }; +/* RFC 4884 extension struct: one per message */ +struct icmp_ext_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 reserved1:4, + version:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:4, + reserved1:4; +#else +#error "Please fix " +#endif + __u8 reserved2; + __sum16 checksum; +}; + +/* RFC 4884 extension object header: one for each object */ +struct icmp_extobj_hdr { + __be16 length; + __u8 class_num; + __u8 class_type; +}; #endif /* _UAPI_LINUX_ICMP_H */ diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 8533bf07450f..3d0d8231dc19 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -123,6 +123,7 @@ struct in_addr { #define IP_CHECKSUM 23 #define IP_BIND_ADDRESS_NO_PORT 24 #define IP_RECVFRAGSIZE 25 +#define IP_RECVERR_RFC4884 26 /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e30515f89802..793aebf07c2a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1116,6 +1116,77 @@ error: goto drop; } +static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off) +{ + struct icmp_extobj_hdr *objh, _objh; + struct icmp_ext_hdr *exth, _exth; + u16 olen; + + exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth); + if (!exth) + return false; + if (exth->version != 2) + return true; + + if (exth->checksum && + csum_fold(skb_checksum(skb, off, skb->len - off, 0))) + return false; + + off += sizeof(_exth); + while (off < skb->len) { + objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh); + if (!objh) + return false; + + olen = ntohs(objh->length); + if (olen < sizeof(_objh)) + return false; + + off += olen; + if (off > skb->len) + return false; + } + + return true; +} + +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + int hlen, off; + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return; + } + + /* outer headers up to inner iph. skb->data is at inner payload */ + hlen = -skb_transport_offset(skb) - sizeof(struct icmphdr); + + /* per rfc 791: maximum packet length of 576 bytes */ + if (hlen + skb->len > 576) + return; + + /* per rfc 4884: minimal datagram length of 128 bytes */ + off = icmp_hdr(skb)->un.reserved[1] * sizeof(u32); + if (off < 128) + return; + + /* kernel has stripped headers: return payload offset in bytes */ + off -= hlen; + if (off + sizeof(struct icmp_ext_hdr) > skb->len) + return; + + out->len = off; + + if (!ip_icmp_error_rfc4884_validate(skb, off)) + out->flags |= SO_EE_RFC4884_FLAG_INVALID; +} + int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 86b3b9a7cea3..a5ea02d7a183 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -411,6 +411,9 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; if (skb_pull(skb, payload - skb->data)) { + if (inet_sk(sk)->recverr_rfc4884) + ip_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; @@ -904,6 +907,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: + case IP_RECVERR_RFC4884: if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -1063,6 +1067,11 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (!val) skb_queue_purge(&sk->sk_error_queue); break; + case IP_RECVERR_RFC4884: + if (val < 0 || val > 1) + goto e_inval; + inet->recverr_rfc4884 = !!val; + break; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -1611,6 +1620,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_RECVERR: val = inet->recverr; break; + case IP_RECVERR_RFC4884: + val = inet->recverr_rfc4884; + break; case IP_MULTICAST_TTL: val = inet->mc_ttl; break; -- cgit v1.2.3 From f65b71aa25a65e13cf3d10445a48c63d3eeb942e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Jul 2020 01:45:29 +0300 Subject: ptp: add ability to configure duty cycle for periodic output There are external event timestampers (PHCs with support for PTP_EXTTS_REQUEST) that timestamp both event edges. When those edges are very close (such as in the case of a short pulse), there is a chance that the collected timestamp might be of the rising, or of the falling edge, we never know. There are also PHCs capable of generating periodic output with a configurable duty cycle. This is good news, because we can space the rising and falling edge out enough in time, that the risks to overrun the 1-entry timestamp FIFO of the extts PHC are lower (example: the perout PHC can be configured for a period of 1 second, and an "on" time of 0.5 seconds, resulting in a duty cycle of 50%). A flag is introduced for signaling that an on time is present in the perout request structure, for preserving compatibility. Logically speaking, the duty cycle cannot exceed 100% and the PTP core checks for this. PHC drivers that don't support this flag emit a periodic output of an unspecified duty cycle, same as before. The duty cycle is encoded as an "on" time, similar to the "start" and "period" times, and reuses the reserved space while preserving overall binary layout. Pahole reported before: struct ptp_perout_request { struct ptp_clock_time start; /* 0 16 */ struct ptp_clock_time period; /* 16 16 */ unsigned int index; /* 32 4 */ unsigned int flags; /* 36 4 */ unsigned int rsv[4]; /* 40 16 */ /* size: 56, cachelines: 1, members: 5 */ /* last cacheline: 56 bytes */ }; And now: struct ptp_perout_request { struct ptp_clock_time start; /* 0 16 */ struct ptp_clock_time period; /* 16 16 */ unsigned int index; /* 32 4 */ unsigned int flags; /* 36 4 */ union { struct ptp_clock_time on; /* 40 16 */ unsigned int rsv[4]; /* 40 16 */ }; /* 40 16 */ /* size: 56, cachelines: 1, members: 5 */ /* last cacheline: 56 bytes */ }; Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 33 +++++++++++++++++++++++++++------ include/uapi/linux/ptp_clock.h | 17 ++++++++++++++--- 2 files changed, 41 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 375cd6e4aade..e0e6f85966e1 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -191,12 +191,33 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EFAULT; break; } - if (((req.perout.flags & ~PTP_PEROUT_VALID_FLAGS) || - req.perout.rsv[0] || req.perout.rsv[1] || - req.perout.rsv[2] || req.perout.rsv[3]) && - cmd == PTP_PEROUT_REQUEST2) { - err = -EINVAL; - break; + if (cmd == PTP_PEROUT_REQUEST2) { + struct ptp_perout_request *perout = &req.perout; + + if (perout->flags & ~PTP_PEROUT_VALID_FLAGS) { + err = -EINVAL; + break; + } + /* + * The "on" field has undefined meaning if + * PTP_PEROUT_DUTY_CYCLE isn't set, we must still treat + * it as reserved, which must be set to zero. + */ + if (!(perout->flags & PTP_PEROUT_DUTY_CYCLE) && + (perout->rsv[0] || perout->rsv[1] || + perout->rsv[2] || perout->rsv[3])) { + err = -EINVAL; + break; + } + if (perout->flags & PTP_PEROUT_DUTY_CYCLE) { + /* The duty cycle must be subunitary. */ + if (perout->on.sec > perout->period.sec || + (perout->on.sec == perout->period.sec && + perout->on.nsec > perout->period.nsec)) { + err = -ERANGE; + break; + } + } } else if (cmd == PTP_PEROUT_REQUEST) { req.perout.flags &= PTP_PEROUT_V1_VALID_FLAGS; req.perout.rsv[0] = 0; diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index ff070aa64278..1d2841155f7d 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -53,12 +53,14 @@ /* * Bits of the ptp_perout_request.flags field: */ -#define PTP_PEROUT_ONE_SHOT (1<<0) +#define PTP_PEROUT_ONE_SHOT (1<<0) +#define PTP_PEROUT_DUTY_CYCLE (1<<1) /* * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl. */ -#define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT) +#define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT | \ + PTP_PEROUT_DUTY_CYCLE) /* * No flags are valid for the original PTP_PEROUT_REQUEST ioctl @@ -105,7 +107,16 @@ struct ptp_perout_request { struct ptp_clock_time period; /* Desired period, zero means disable. */ unsigned int index; /* Which channel to configure. */ unsigned int flags; - unsigned int rsv[4]; /* Reserved for future use. */ + union { + /* + * The "on" time of the signal. + * Must be lower than the period. + * Valid only if (flags & PTP_PEROUT_DUTY_CYCLE) is set. + */ + struct ptp_clock_time on; + /* Reserved for future use. */ + unsigned int rsv[4]; + }; }; #define PTP_MAX_SAMPLES 25 /* Maximum allowed offset measurement samples. */ -- cgit v1.2.3 From b6bd41363a1ca39282496803cc32f7515ed917fe Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Jul 2020 01:45:30 +0300 Subject: ptp: introduce a phase offset in the periodic output request Some PHCs like the ocelot/felix switch cannot emit generic periodic output, but just PPS (pulse per second) signals, which: - don't start from arbitrary absolute times, but are rather phase-aligned to the beginning of [the closest next] second. - have an optional phase offset relative to that beginning of the second. For those, it was initially established that they should reject any other absolute time for the PTP_PEROUT_REQUEST than 0.000000000 [1]. But when it actually came to writing an application [2] that makes use of this functionality, we realized that we can't really deal generically with PHCs that support absolute start time, and with PHCs that don't, without an explicit interface. Namely, in an ideal world, PHC drivers would ensure that the "perout.start" value written to hardware will result in a functional output. This means that if the PTP time has become in the past of this PHC's current time, it should be automatically fast-forwarded by the driver into a close enough future time that is known to work (note: this is necessary only if the hardware doesn't do this fast-forward by itself). But we don't really know what is the status for PHC drivers in use today, so in the general sense, user space would be risking to have a non-functional periodic output if it simply asked for a start time of 0.000000000. So let's introduce a flag for this type of reduced-functionality hardware, named PTP_PEROUT_PHASE. The start time is just "soon", the only thing we know for sure about this signal is that its rising edge events, Rn, occur at: Rn = perout.phase + n * perout.period The "phase" in the periodic output structure is simply an alias to the "start" time, since both cannot logically be specified at the same time. Therefore, the binary layout of the structure is not affected. [1]: https://patchwork.ozlabs.org/project/netdev/patch/20200320103726.32559-7-yangbo.lu@nxp.com/ [2]: https://www.mail-archive.com/linuxptp-devel@lists.sourceforge.net/msg04142.html Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/uapi/linux/ptp_clock.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 1d2841155f7d..1d108d597f66 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -55,12 +55,14 @@ */ #define PTP_PEROUT_ONE_SHOT (1<<0) #define PTP_PEROUT_DUTY_CYCLE (1<<1) +#define PTP_PEROUT_PHASE (1<<2) /* * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl. */ #define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT | \ - PTP_PEROUT_DUTY_CYCLE) + PTP_PEROUT_DUTY_CYCLE | \ + PTP_PEROUT_PHASE) /* * No flags are valid for the original PTP_PEROUT_REQUEST ioctl @@ -103,7 +105,20 @@ struct ptp_extts_request { }; struct ptp_perout_request { - struct ptp_clock_time start; /* Absolute start time. */ + union { + /* + * Absolute start time. + * Valid only if (flags & PTP_PEROUT_PHASE) is unset. + */ + struct ptp_clock_time start; + /* + * Phase offset. The signal should start toggling at an + * unspecified integer multiple of the period, plus this value. + * The start time should be "as soon as possible". + * Valid only if (flags & PTP_PEROUT_PHASE) is set. + */ + struct ptp_clock_time phase; + }; struct ptp_clock_time period; /* Desired period, zero means disable. */ unsigned int index; /* Which channel to configure. */ unsigned int flags; -- cgit v1.2.3 From b0487e0d96d58906e286592dd02e7292f53e399a Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Sun, 19 Jul 2020 19:14:28 +0200 Subject: drm: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200719171428.60470-1-grandmaster@al2klimov.de --- Documentation/gpu/vgaarbiter.rst | 8 ++++---- drivers/gpu/drm/drm_modes.c | 2 +- include/uapi/drm/drm_mode.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/gpu/vgaarbiter.rst b/Documentation/gpu/vgaarbiter.rst index 0b41b051d021..339ed5fecd2e 100644 --- a/Documentation/gpu/vgaarbiter.rst +++ b/Documentation/gpu/vgaarbiter.rst @@ -185,7 +185,7 @@ enhancing the kernel code to adapt as a kernel module and also did the implementation of the user space side [3]. Now (2009) Tiago Vignatti and Dave Airlie finally put this work in shape and queued to Jesse Barnes' PCI tree. -0) http://cgit.freedesktop.org/xorg/xserver/commit/?id=4b42448a2388d40f257774fbffdccaea87bd0347 -1) http://lists.freedesktop.org/archives/xorg/2005-March/006663.html -2) http://lists.freedesktop.org/archives/xorg/2005-March/006745.html -3) http://lists.freedesktop.org/archives/xorg/2007-October/029507.html +0) https://cgit.freedesktop.org/xorg/xserver/commit/?id=4b42448a2388d40f257774fbffdccaea87bd0347 +1) https://lists.freedesktop.org/archives/xorg/2005-March/006663.html +2) https://lists.freedesktop.org/archives/xorg/2005-March/006745.html +3) https://lists.freedesktop.org/archives/xorg/2007-October/029507.html diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index f2865f88bd54..14b6f7638728 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -548,7 +548,7 @@ EXPORT_SYMBOL(drm_gtf_mode_complex); * Generalized Timing Formula is derived from: * * GTF Spreadsheet by Andy Morrish (1/5/97) - * available at http://www.vesa.org + * available at https://www.vesa.org * * And it is copied from the file of xserver/hw/xfree86/modes/xf86gtf.c. * What I have done is to translate it by using integer calculation. diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 735c8cfdaaa1..deea447e5f22 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -497,7 +497,7 @@ struct drm_mode_fb_cmd2 { * In case of planar formats, this ioctl allows up to 4 * buffer objects with offsets and pitches per plane. * The pitch and offset order is dictated by the fourcc, - * e.g. NV12 (http://fourcc.org/yuv.php#NV12) is described as: + * e.g. NV12 (https://fourcc.org/yuv.php#NV12) is described as: * * YUV 4:2:0 image with a plane of 8 bit Y samples * followed by an interleaved U/V plane containing -- cgit v1.2.3 From 6c0246a4588d418f72acd40a7b7601be403d80a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:28 +0800 Subject: perf: Add perf_event_mmap_page::cap_user_time_short ABI In order to support short clock counters, provide an ABI extension. As a whole: u64 time, delta, cyc = read_cycle_counter(); + if (cap_user_time_short) + cyc = time_cycle + ((cyc - time_cycle) & time_mask); delta = mul_u64_u32_shr(cyc, time_mult, time_shift); if (cap_user_time_zero) time = time_zero + delta; delta += time_offset; Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-6-leo.yan@linaro.org Signed-off-by: Will Deacon --- include/uapi/linux/perf_event.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..21a1edd08cbe 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -532,9 +532,10 @@ struct perf_event_mmap_page { cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ - cap_user_time : 1, /* The time_* fields are used */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - cap_____res : 59; + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; }; }; @@ -593,13 +594,29 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; /* * Hole for extension of the self monitor capabilities */ - __u8 __reserved[118*8+4]; /* align to 1k. */ + __u8 __reserved[116*8]; /* align to 1k. */ /* * Control data for the mmap() data buffer. -- cgit v1.2.3 From 8e7eafb816ab7e5047b74cb8eb1db2f8f14f7d7a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:32:20 -0700 Subject: RDMA: rdma_user_ioctl.h: fix a duplicated word + clarify Change the repeated word "it" in a comment to "it to". Also insert a dash in the sentence to add clarity. Link: https://lore.kernel.org/r/20200719003220.21250-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index d92d2721b28c..53c55188dd2a 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -43,7 +43,7 @@ /* * General blocks assignments - * It is closed on purpose do not expose it it user space + * It is closed on purpose - do not expose it to user space * #define MAD_CMD_BASE 0x00 * #define HFI1_CMD_BAS 0xE0 */ -- cgit v1.2.3 From b43870c74f3fdf0cd06bf5f1b7a5ed70a2cd4ed2 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Sat, 4 Jul 2020 15:15:28 +0000 Subject: audit: report audit wait metric in audit status reply In environments where the preservation of audit events and predictable usage of system memory are prioritized, admins may use a combination of --backlog_wait_time and -b options at the risk of degraded performance resulting from backlog waiting. In some cases, this risk may be preferred to lost events or unbounded memory usage. Ideally, this risk can be mitigated by making adjustments when backlog waiting is detected. However, detection can be difficult using the currently available metrics. For example, an admin attempting to debug degraded performance may falsely believe a full backlog indicates backlog waiting. It may turn out the backlog frequently fills up but drains quickly. To make it easier to reliably track degraded performance to backlog waiting, this patch makes the following changes: Add a new field backlog_wait_time_total to the audit status reply. Initialize this field to zero. Add to this field the total time spent by the current task on scheduled timeouts while the backlog limit is exceeded. Reset field to zero upon request via AUDIT_SET. Tested on Ubuntu 18.04 using complementary changes to the audit-userspace and audit-testsuite: - https://github.com/linux-audit/audit-userspace/pull/134 - https://github.com/linux-audit/audit-testsuite/pull/97 Signed-off-by: Max Englander Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 18 +++++++++++------- kernel/audit.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9b6a973f4cc3..cd2d8279a5e4 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -333,14 +333,15 @@ enum { }; /* Status symbols */ - /* Mask values */ -#define AUDIT_STATUS_ENABLED 0x0001 -#define AUDIT_STATUS_FAILURE 0x0002 -#define AUDIT_STATUS_PID 0x0004 + /* Mask values */ +#define AUDIT_STATUS_ENABLED 0x0001 +#define AUDIT_STATUS_FAILURE 0x0002 +#define AUDIT_STATUS_PID 0x0004 #define AUDIT_STATUS_RATE_LIMIT 0x0008 -#define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 -#define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 -#define AUDIT_STATUS_LOST 0x0040 +#define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 +#define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 +#define AUDIT_STATUS_LOST 0x0040 +#define AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL 0x0080 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 @@ -467,6 +468,9 @@ struct audit_status { __u32 feature_bitmap; /* bitmap of kernel audit features */ }; __u32 backlog_wait_time;/* message queue wait timeout */ + __u32 backlog_wait_time_actual;/* time spent waiting while + * message limit exceeded + */ }; struct audit_features { diff --git a/kernel/audit.c b/kernel/audit.c index a2f3e34aa724..d72663ac248c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -136,6 +136,11 @@ u32 audit_sig_sid = 0; */ static atomic_t audit_lost = ATOMIC_INIT(0); +/* Monotonically increasing sum of time the kernel has spent + * waiting while the backlog limit is exceeded. + */ +static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); + /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -1201,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); - s.enabled = audit_enabled; - s.failure = audit_failure; + s.enabled = audit_enabled; + s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ - s.pid = auditd_pid_vnr(); - s.rate_limit = audit_rate_limit; - s.backlog_limit = audit_backlog_limit; - s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_queue); - s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.pid = auditd_pid_vnr(); + s.rate_limit = audit_rate_limit; + s.backlog_limit = audit_backlog_limit; + s.lost = atomic_read(&audit_lost); + s.backlog = skb_queue_len(&audit_queue); + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; + s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1315,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_config_change("lost", 0, lost, 1); return lost; } + if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { + u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); + + audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); + return actual; + } break; } case AUDIT_GET_FEATURE: @@ -1826,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { + long rtime = stime; + DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - stime = schedule_timeout(stime); + stime = schedule_timeout(rtime); + atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) -- cgit v1.2.3 From 4787dd582dbde0b7f29eb3dbe59df3da1b350925 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Fri, 17 Jul 2020 08:05:12 +0530 Subject: bareudp: Reverted support to enable & disable rx metadata collection The commit fe80536acf83 ("bareudp: Added attribute to enable & disable rx metadata collection") breaks the the original(5.7) default behavior of bareudp module to collect RX metadadata at the receive. It was added to avoid the crash at the kernel neighbour subsytem when packet with metadata from bareudp is processed. But it is no more needed as the commit 394de110a733 ("net: Added pointer check for dst->ops->neigh_lookup in dst_neigh_lookup_skb") solves this crash. Fixes: fe80536acf83 ("bareudp: Added attribute to enable & disable rx metadata collection") Signed-off-by: Martin Varghese Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- Documentation/networking/bareudp.rst | 6 ++---- drivers/net/bareudp.c | 21 +++++---------------- include/net/bareudp.h | 1 - include/uapi/linux/if_link.h | 1 - 4 files changed, 7 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/bareudp.rst b/Documentation/networking/bareudp.rst index 0e00636d8d74..465a8b251bfe 100644 --- a/Documentation/networking/bareudp.rst +++ b/Documentation/networking/bareudp.rst @@ -48,7 +48,5 @@ enabled. The bareudp device could be used along with OVS or flower filter in TC. The OVS or TC flower layer must set the tunnel information in SKB dst field before sending packet buffer to the bareudp device for transmission. On reception the -bareudp device decapsulates the udp header and passes the inner packet to the -network stack. If RX_COLLECT_METADATA flag is enabled in the device the tunnel -information will be stored in the SKB dst field before the packet buffer is -passed to the network stack. +bareudp device extracts and stores the tunnel information in SKB dst field before +passing the packet buffer to the network stack. diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 108a8cafc4f8..44eb2b1d0416 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -46,7 +46,6 @@ struct bareudp_dev { __be16 port; u16 sport_min; bool multi_proto_mode; - bool rx_collect_metadata; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; @@ -126,14 +125,12 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) bareudp->dev->stats.rx_dropped++; goto drop; } - if (bareudp->rx_collect_metadata) { - tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); - if (!tun_dst) { - bareudp->dev->stats.rx_dropped++; - goto drop; - } - skb_dst_set(skb, &tun_dst->dst); + tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); + if (!tun_dst) { + bareudp->dev->stats.rx_dropped++; + goto drop; } + skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -577,9 +574,6 @@ static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; - if (data[IFLA_BAREUDP_RX_COLLECT_METADATA]) - conf->rx_collect_metadata = true; - return 0; } @@ -617,7 +611,6 @@ static int bareudp_configure(struct net *net, struct net_device *dev, bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; - bareudp->rx_collect_metadata = conf->rx_collect_metadata; err = register_netdevice(dev); if (err) @@ -676,7 +669,6 @@ static size_t bareudp_get_size(const struct net_device *dev) nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ - nla_total_size(0) + /* IFLA_BAREUDP_RX_COLLECT_METADATA */ 0; } @@ -693,9 +685,6 @@ static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; - if (bareudp->rx_collect_metadata && - nla_put_flag(skb, IFLA_BAREUDP_RX_COLLECT_METADATA)) - goto nla_put_failure; return 0; diff --git a/include/net/bareudp.h b/include/net/bareudp.h index 3dd5f9a8d01c..dc65a0d71d9b 100644 --- a/include/net/bareudp.h +++ b/include/net/bareudp.h @@ -12,7 +12,6 @@ struct bareudp_conf { __be16 port; u16 sport_min; bool multi_proto_mode; - bool rx_collect_metadata; }; struct net_device *bareudp_dev_create(struct net *net, const char *name, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 26842ffd0501..af8f31987526 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -601,7 +601,6 @@ enum { IFLA_BAREUDP_ETHERTYPE, IFLA_BAREUDP_SRCPORT_MIN, IFLA_BAREUDP_MULTIPROTO_MODE, - IFLA_BAREUDP_RX_COLLECT_METADATA, __IFLA_BAREUDP_MAX }; -- cgit v1.2.3 From c333f9495c451d958c6f4a41e5de2d8f80f79496 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 16:37:13 -0700 Subject: raid: md_p.h: drop duplicated word in a comment Drop the doubled word "the" in a comment. Signed-off-by: Randy Dunlap Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Song Liu --- include/uapi/linux/raid/md_p.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 1f2d8c81f0e0..e5a98a16f9b0 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -123,7 +123,7 @@ typedef struct mdp_device_descriptor_s { /* * Notes: - * - if an array is being reshaped (restriped) in order to change the + * - if an array is being reshaped (restriped) in order to change * the number of active devices in the array, 'raid_disks' will be * the larger of the old and new numbers. 'delta_disks' will * be the "new - old". So if +ve, raid_disks is the new value, and -- cgit v1.2.3 From 1859f4ebcf2dfe2e878a3805700036fdc4e01e2c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:27:38 -0700 Subject: android: binder.h: drop a duplicated word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the repeated word "the" in a comment. Cc: Arve Hjønnevåg Cc: Todd Kjos Cc: Martijn Coenen Cc: Joel Fernandes Cc: Hridya Valsaraju Cc: Suren Baghdasaryan Cc: devel@driverdev.osuosl.org Acked-by: Christian Brauner Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20200719002738.20210-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/android/binder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 2832134e5397..f1ce2c4c077e 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -404,7 +404,7 @@ enum binder_driver_return_protocol { BR_FAILED_REPLY = _IO('r', 17), /* - * The the last transaction (either a bcTRANSACTION or + * The last transaction (either a bcTRANSACTION or * a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters. */ }; -- cgit v1.2.3 From 7deff7b5b4395784b194bae3631b8333d3423938 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:28:41 -0700 Subject: hyperv: hyperv.h: drop a duplicated word Drop the repeated word "the" in a comment. Signed-off-by: Randy Dunlap Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: linux-hyperv@vger.kernel.org Link: https://lore.kernel.org/r/20200719002841.20369-1-rdunlap@infradead.org Signed-off-by: Wei Liu --- include/uapi/linux/hyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index 8f24404ad04f..6135d92e0d47 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -219,7 +219,7 @@ struct hv_do_fcopy { * kernel and user-level daemon communicate using a connector channel. * * The user mode component first registers with the - * the kernel component. Subsequently, the kernel component requests, data + * kernel component. Subsequently, the kernel component requests, data * for the specified keys. In response to this message the user mode component * fills in the value corresponding to the specified key. We overload the * sequence field in the cn_msg header to define our KVP message types. -- cgit v1.2.3 From 3bf1c021e36e7269c71dceafec713b0181f413a9 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 9 Jun 2020 16:14:55 +0300 Subject: uapi/habanalabs: fix some comments MAP/UNMAP are done also for device memory. Reviewed-by: Omer Shpigelman Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index f6267a8d7416..f218d1c62c62 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -530,13 +530,13 @@ union hl_wait_cs_args { struct hl_wait_cs_out out; }; -/* Opcode to alloc device memory */ +/* Opcode to allocate device memory */ #define HL_MEM_OP_ALLOC 0 /* Opcode to free previously allocated device memory */ #define HL_MEM_OP_FREE 1 -/* Opcode to map host memory */ +/* Opcode to map host and device memory */ #define HL_MEM_OP_MAP 2 -/* Opcode to unmap previously mapped host memory */ +/* Opcode to unmap previously mapped host and device memory */ #define HL_MEM_OP_UNMAP 3 /* Memory flags */ -- cgit v1.2.3 From db491e4f08a9fd84ebb1ebd22a6b0b988a81a0d8 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 18 Jun 2020 09:51:16 +0300 Subject: habanalabs: Add dropped cs statistics info struct Add command submission statistics structure which can be obtained through the info ioctl. Each drop counter describes the reason for which the command submission was dropped. This information is needed for the user to be aware of the specific reason for which the submitted work was dropped. The user can then utilize the driver more efficiently. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 24 +++++++++++++++++++++++- drivers/misc/habanalabs/habanalabs.h | 5 +++++ drivers/misc/habanalabs/habanalabs_ioctl.c | 24 ++++++++++++++++++++++++ drivers/misc/habanalabs/hw_queue.c | 5 ++++- include/uapi/misc/habanalabs.h | 21 +++++++++++++++++++++ 5 files changed, 77 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index f81d6685e011..777f88d25acd 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -246,6 +246,18 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job) kfree(job); } +static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx) +{ + hdev->aggregated_cs_counters.device_in_reset_drop_cnt += + ctx->cs_counters.device_in_reset_drop_cnt; + hdev->aggregated_cs_counters.out_of_mem_drop_cnt += + ctx->cs_counters.out_of_mem_drop_cnt; + hdev->aggregated_cs_counters.parsing_drop_cnt += + ctx->cs_counters.parsing_drop_cnt; + hdev->aggregated_cs_counters.queue_full_drop_cnt += + ctx->cs_counters.queue_full_drop_cnt; +} + static void cs_do_release(struct kref *ref) { struct hl_cs *cs = container_of(ref, struct hl_cs, @@ -349,6 +361,8 @@ static void cs_do_release(struct kref *ref) dma_fence_signal(cs->fence); dma_fence_put(cs->fence); + cs_counters_aggregate(hdev, cs->ctx); + kfree(cs); } @@ -632,12 +646,15 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = validate_queue_index(hdev, chunk, &queue_type, &is_kernel_allocated_cb); - if (rc) + if (rc) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; goto free_cs_object; + } if (is_kernel_allocated_cb) { cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk); if (!cb) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; rc = -EINVAL; goto free_cs_object; } @@ -651,6 +668,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, job = hl_cs_allocate_job(hdev, queue_type, is_kernel_allocated_cb); if (!job) { + hpriv->ctx->cs_counters.out_of_mem_drop_cnt++; dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; if (is_kernel_allocated_cb) @@ -683,6 +701,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = cs_parser(hpriv, job); if (rc) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; dev_err(hdev->dev, "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n", cs->ctx->asid, cs->sequence, job->id, rc); @@ -691,6 +710,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, } if (int_queues_only) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; dev_err(hdev->dev, "Reject CS %d.%llu because only internal queues jobs are present\n", cs->ctx->asid, cs->sequence); @@ -875,6 +895,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, job = hl_cs_allocate_job(hdev, q_type, true); if (!job) { + ctx->cs_counters.out_of_mem_drop_cnt++; dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; goto put_cs; @@ -882,6 +903,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, cb = hl_cb_kernel_create(hdev, PAGE_SIZE); if (!cb) { + ctx->cs_counters.out_of_mem_drop_cnt++; kfree(job); rc = -EFAULT; goto put_cs; diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index e4d6f7c91194..ae781453a509 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -10,6 +10,7 @@ #include "include/armcp_if.h" #include "include/qman_if.h" +#include #include #include @@ -787,6 +788,7 @@ struct hl_ctx { struct mutex mem_hash_lock; struct mutex mmu_lock; struct list_head debugfs_list; + struct hl_cs_counters cs_counters; u64 cs_sequence; u64 *dram_default_hops; spinlock_t cs_lock; @@ -1391,6 +1393,7 @@ struct hl_device_idle_busy_ts { * @compute_ctx: current compute context executing. * @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy * and vice-versa + * @aggregated_cs_counters: aggregated cs counters among all contexts * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This @@ -1489,6 +1492,8 @@ struct hl_device { struct hl_device_idle_busy_ts *idle_busy_ts_arr; + struct hl_cs_counters aggregated_cs_counters; + atomic64_t dram_used_mem; u64 timeout_jiffies; u64 max_power; diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c index 52eedd3a6c3a..5af1c03da473 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -276,6 +276,27 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args) min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0; } +static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_cs_counters cs_counters = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + memcpy(&cs_counters.cs_counters, &hdev->aggregated_cs_counters, + sizeof(struct hl_cs_counters)); + + if (hpriv->ctx) + memcpy(&cs_counters.ctx_cs_counters, &hpriv->ctx->cs_counters, + sizeof(struct hl_cs_counters)); + + return copy_to_user(out, &cs_counters, + min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -336,6 +357,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_TIME_SYNC: return time_sync_info(hdev, args); + case HL_INFO_CS_COUNTERS: + return cs_counters_info(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index f5a10a5ac300..da66ffb528f8 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -514,6 +514,7 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) hdev->asic_funcs->hw_queues_lock(hdev); if (hl_device_disabled_or_in_reset(hdev)) { + ctx->cs_counters.device_in_reset_drop_cnt++; dev_err(hdev->dev, "device is disabled or in reset, CS rejected!\n"); rc = -EPERM; @@ -543,8 +544,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) break; } - if (rc) + if (rc) { + ctx->cs_counters.queue_full_drop_cnt++; goto unroll_cq_resv; + } if (q->queue_type == QUEUE_TYPE_EXT || q->queue_type == QUEUE_TYPE_HW) diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index f218d1c62c62..d5c4f983b7a8 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -263,6 +263,7 @@ enum hl_device_status { * time the driver was loaded. * HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time * for synchronization. + * HL_INFO_CS_COUNTERS - Retrieve command submission counters */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -274,6 +275,7 @@ enum hl_device_status { #define HL_INFO_CLK_RATE 8 #define HL_INFO_RESET_COUNT 9 #define HL_INFO_TIME_SYNC 10 +#define HL_INFO_CS_COUNTERS 11 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -338,6 +340,25 @@ struct hl_info_time_sync { __u64 host_time; }; +/** + * struct hl_info_cs_counters - command submission counters + * @out_of_mem_drop_cnt: dropped due to memory allocation issue + * @parsing_drop_cnt: dropped due to error in packet parsing + * @queue_full_drop_cnt: dropped due to queue full + * @device_in_reset_drop_cnt: dropped due to device in reset + */ +struct hl_cs_counters { + __u64 out_of_mem_drop_cnt; + __u64 parsing_drop_cnt; + __u64 queue_full_drop_cnt; + __u64 device_in_reset_drop_cnt; +}; + +struct hl_info_cs_counters { + struct hl_cs_counters cs_counters; + struct hl_cs_counters ctx_cs_counters; +}; + struct hl_info_args { /* Location of relevant struct in userspace */ __u64 return_pointer; -- cgit v1.2.3 From 8b603d0715a372f5827d3a6b19d9568bf854b687 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 24 Jul 2020 10:41:12 +0200 Subject: RDMA/mlx5: Fix typo in enum name Nnothing uses the enum name, so this is harmless. Fixes: 322694412400 ("IB/mlx5: Introduce driver create and destroy flow methods") Link: https://lore.kernel.org/r/20200724084112.GC31930@amd Signed-off-by: Pavel Machek (CIP) Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index b330e6eee626..e24d66d278cf 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -263,7 +263,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_FLAGS, }; -enum mlx5_ib_destoy_flow_attrs { +enum mlx5_ib_destroy_flow_attrs { MLX5_IB_ATTR_DESTROY_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT), }; -- cgit v1.2.3 From 5923b8f7fa218a9bccd730c0a9692635eb2fc740 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 23 Jul 2020 01:03:01 +0300 Subject: net/sched: cls_flower: Add hash info to flow classification Adding new cls flower keys for hash value and hash mask and dissect the hash info from the skb into the flow key towards flow classication. Signed-off-by: Ariel Levkovich Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 +++ net/sched/cls_flower.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7576209d96f9..ee95f42fb0ec 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -578,6 +578,9 @@ enum { TCA_FLOWER_KEY_MPLS_OPTS, + TCA_FLOWER_KEY_HASH, /* u32 */ + TCA_FLOWER_KEY_HASH_MASK, /* u32 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index acd8e05c2ba5..a4f7ef1de7e7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -64,6 +64,7 @@ struct fl_flow_key { }; } tp_range; struct flow_dissector_key_ct ct; + struct flow_dissector_key_hash hash; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -318,6 +319,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, ARRAY_SIZE(fl_ct_info_to_flower_map)); + skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); f = fl_mask_lookup(mask, &skb_key); @@ -695,6 +697,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, .len = 128 / BITS_PER_BYTE }, [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 }, + }; static const struct nla_policy @@ -1626,6 +1631,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + fl_set_key_val(tb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash)); + if (tb[TCA_FLOWER_KEY_ENC_OPTS]) { ret = fl_set_enc_opt(tb, key, mask, extack); if (ret) @@ -1740,6 +1749,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CT, ct); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_HASH, hash); skb_flow_dissector_init(dissector, keys, cnt); } @@ -2960,6 +2971,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; + if (fl_dump_key_val(skb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash))) + goto nla_put_failure; + return 0; nla_put_failure: -- cgit v1.2.3 From 01370434df85eb76ecb1527a4466013c4aca2436 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 24 Jul 2020 09:03:10 -0400 Subject: icmp6: support rfc 4884 Extend the rfc 4884 read interface introduced for ipv4 in commit eba75c587e81 ("icmp: support rfc 4884") to ipv6. Add socket option SOL_IPV6/IPV6_RECVERR_RFC4884. Changes v1->v2: - make ipv6_icmp_error_rfc4884 static (file scope) Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/icmpv6.h | 1 + include/uapi/linux/in6.h | 1 + net/ipv4/icmp.c | 1 + net/ipv6/datagram.c | 16 ++++++++++++++++ net/ipv6/ipv6_sockglue.c | 12 ++++++++++++ 6 files changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8d8f877e7f81..a44789d027cc 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -283,6 +283,7 @@ struct ipv6_pinfo { autoflowlabel:1, autoflowlabel_set:1, mc_all:1, + recverr_rfc4884:1, rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index 2622b5a3e616..c1661febc2dc 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -68,6 +68,7 @@ struct icmp6hdr { #define icmp6_mtu icmp6_dataun.un_data32[0] #define icmp6_unused icmp6_dataun.un_data32[0] #define icmp6_maxdelay icmp6_dataun.un_data16[0] +#define icmp6_datagram_len icmp6_dataun.un_data8[0] #define icmp6_router icmp6_dataun.u_nd_advt.router #define icmp6_solicited icmp6_dataun.u_nd_advt.solicited #define icmp6_override icmp6_dataun.u_nd_advt.override diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 9f2273a08356..5ad396a57eb3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -179,6 +179,7 @@ struct in6_flowlabel_req { #define IPV6_LEAVE_ANYCAST 28 #define IPV6_MULTICAST_ALL 29 #define IPV6_ROUTER_ALERT_ISOLATE 30 +#define IPV6_RECVERR_RFC4884 31 /* IPV6_MTU_DISCOVER values */ #define IPV6_PMTUDISC_DONT 0 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7498c58460a1..cf36f955bfe6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1173,6 +1173,7 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb, if (!ip_icmp_error_rfc4884_validate(skb, off)) out->flags |= SO_EE_RFC4884_FLAG_INVALID; } +EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884); int icmp_err(struct sk_buff *skb, u32 info) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 390bedde21a5..cc8ad7ddecda 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -284,6 +285,17 @@ int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); +static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + switch (icmp6_hdr(skb)->icmp6_type) { + case ICMPV6_TIME_EXCEED: + case ICMPV6_DEST_UNREACH: + ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), + icmp6_hdr(skb)->icmp6_datagram_len * 8); + } +} + void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { @@ -313,6 +325,10 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; __skb_pull(skb, payload - skb->data); + + if (inet6_sk(sk)->recverr_rfc4884) + ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d2282f5c9760..20c740976334 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -965,6 +965,14 @@ done: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; + case IPV6_RECVERR_RFC4884: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 1) + goto e_inval; + np->recverr_rfc4884 = valbool; + retv = 0; + break; } release_sock(sk); @@ -1439,6 +1447,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rtalert_isolate; break; + case IPV6_RECVERR_RFC4884: + val = np->recverr_rfc4884; + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3 From d721a43ff69cd473019c3b77aacb76b09102aca3 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:27 +0800 Subject: bcache: increase super block version for cache device and backing device The new added super block version BCACHE_SB_VERSION_BDEV_WITH_FEATURES (5) BCACHE_SB_VERSION_CDEV_WITH_FEATURES value (6), is for the feature set bits. Devices have super block version equal to the new version will have three new members for feature set bits in the on-disk super block, __le64 feature_compat; __le64 feature_incompat; __le64 feature_ro_compat; They are used for further new features which may introduce on-disk format change, and avoid unncessary super block version increase. The very basic features handling code skeleton is also initialized in this patch. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/features.h | 78 ++++++++++++++++++++++++++++++++++++++++++++ drivers/md/bcache/super.c | 32 ++++++++++++++++-- include/uapi/linux/bcache.h | 29 +++++++++++----- 3 files changed, 128 insertions(+), 11 deletions(-) create mode 100644 drivers/md/bcache/features.h (limited to 'include/uapi') diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h new file mode 100644 index 000000000000..ae7df37b9862 --- /dev/null +++ b/drivers/md/bcache/features.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _BCACHE_FEATURES_H +#define _BCACHE_FEATURES_H + +#include +#include +#include + +#define BCH_FEATURE_COMPAT 0 +#define BCH_FEATURE_RO_COMPAT 1 +#define BCH_FEATURE_INCOMPAT 2 +#define BCH_FEATURE_TYPE_MASK 0x03 + +#define BCH_FEATURE_COMPAT_SUUP 0 +#define BCH_FEATURE_RO_COMPAT_SUUP 0 +#define BCH_FEATURE_INCOMPAT_SUUP 0 + +#define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_ro_compat & (mask)) +#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ + ((sb)->feature_incompat & (mask)) + +/* Feature set definition */ + +#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_compat & \ + BCH##_FEATURE_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat |= \ + BCH##_FEATURE_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat &= \ + ~BCH##_FEATURE_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_ro_compat & \ + BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat |= \ + BCH##_FEATURE_RO_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat &= \ + ~BCH##_FEATURE_RO_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_incompat & \ + BCH##_FEATURE_INCOMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat |= \ + BCH##_FEATURE_INCOMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat &= \ + ~BCH##_FEATURE_INCOMPAT_##flagname; \ +} + +#endif diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 40fb18028c01..c6ef410a21a8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -13,6 +13,7 @@ #include "extents.h" #include "request.h" #include "writeback.h" +#include "features.h" #include #include @@ -194,6 +195,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->data_offset = BDEV_DATA_START_DEFAULT; break; case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + case BCACHE_SB_VERSION_BDEV_WITH_FEATURES: sb->data_offset = le64_to_cpu(s->data_offset); err = "Bad data offset"; @@ -207,6 +209,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, if (err) goto err; break; + case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: + err = read_super_common(sb, bdev, s); + if (err) + goto err; + sb->feature_compat = le64_to_cpu(s->feature_compat); + sb->feature_incompat = le64_to_cpu(s->feature_incompat); + sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + break; default: err = "Unsupported superblock version"; goto err; @@ -241,7 +251,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); - out->version = cpu_to_le64(sb->version); memcpy(out->uuid, sb->uuid, 16); memcpy(out->set_uuid, sb->set_uuid, 16); @@ -257,6 +266,13 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, for (i = 0; i < sb->keys; i++) out->d[i] = cpu_to_le64(sb->d[i]); + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + out->feature_compat = cpu_to_le64(sb->feature_compat); + out->feature_incompat = cpu_to_le64(sb->feature_incompat); + out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat); + } + + out->version = cpu_to_le64(sb->version); out->csum = csum_set(out); pr_debug("ver %llu, flags %llu, seq %llu\n", @@ -313,17 +329,20 @@ void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; struct cache *ca; - unsigned int i; + unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID; down(&c->sb_write_mutex); closure_init(cl, &c->cl); c->sb.seq++; + if (c->sb.version > version) + version = c->sb.version; + for_each_cache(ca, c, i) { struct bio *bio = &ca->sb_bio; - ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + ca->sb.version = version; ca->sb.seq = c->sb.seq; ca->sb.last_mount = c->sb.last_mount; @@ -1839,6 +1858,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sb.bucket_size = sb->bucket_size; c->sb.nr_in_set = sb->nr_in_set; c->sb.last_mount = sb->last_mount; + c->sb.version = sb->version; + if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + c->sb.feature_compat = sb->feature_compat; + c->sb.feature_ro_compat = sb->feature_ro_compat; + c->sb.feature_incompat = sb->feature_incompat; + } + c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 9a1965c6c3d0..47df2db2e727 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -141,11 +141,13 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) * Version 3: Cache device with new UUID format * Version 4: Backing device with data offset */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5 +#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6 +#define BCACHE_SB_MAX_VERSION 6 #define SB_SECTOR 8 #define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) @@ -173,7 +175,12 @@ struct cache_sb_disk { __le64 flags; __le64 seq; - __le64 pad[8]; + + __le64 feature_compat; + __le64 feature_incompat; + __le64 feature_ro_compat; + + __le64 pad[5]; union { struct { @@ -224,7 +231,12 @@ struct cache_sb { __u64 flags; __u64 seq; - __u64 pad[8]; + + __u64 feature_compat; + __u64 feature_incompat; + __u64 feature_ro_compat; + + __u64 pad[5]; union { struct { @@ -262,7 +274,8 @@ struct cache_sb { static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) { return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES; } BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); -- cgit v1.2.3 From 4c1ccd0896d6a45f2159280d957afd441a7aeaba Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:29 +0800 Subject: bcache: struct cache_sb is only for in-memory super block now We have struct cache_sb_disk for on-disk super block already, it is unnecessary to keep the in-memory super block format exactly mapping to the on-disk struct layout. This patch adds code comments to notice that struct cache_sb is not exactly mapping to cache_sb_disk, and removes the useless member csum and pad[5]. Although struct cache_sb does not belong to uapi, but there are still some on-disk format related macros reference it and it is unncessary to get rid of such dependency now. So struct cache_sb will continue to stay in include/uapi/linux/bache.h for now. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/bcache.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 47df2db2e727..0ef984ea515a 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -215,8 +215,13 @@ struct cache_sb_disk { __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ }; +/* + * This is for in-memory bcache super block. + * NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member + * size, ordering and even whole struct size may be different + * from cache_sb_disk. + */ struct cache_sb { - __u64 csum; __u64 offset; /* sector where this sb was written */ __u64 version; @@ -236,8 +241,6 @@ struct cache_sb { __u64 feature_incompat; __u64 feature_ro_compat; - __u64 pad[5]; - union { struct { /* Cache devices */ @@ -245,7 +248,6 @@ struct cache_sb { __u16 block_size; /* sectors */ __u16 bucket_size; /* sectors */ - __u16 nr_in_set; __u16 nr_this_dev; }; -- cgit v1.2.3 From ffa470327572b8f85dceda48fd0676d9658cb8c5 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:35 +0800 Subject: bcache: add bucket_size_hi into struct cache_sb_disk for large bucket The large bucket feature is to extend bucket_size from 16bit to 32bit. When create cache device on zoned device (e.g. zoned NVMe SSD), making a single bucket cover one or more zones of the zoned device is the simplest way to support zoned device as cache by bcache. But current maximum bucket size is 16MB and a typical zone size of zoned device is 256MB, this is the major motiviation to extend bucket size to a larger bit width. This patch is the basic and first change to support large bucket size, the major changes it makes are, - Add BCH_FEATURE_INCOMPAT_LARGE_BUCKET for the large bucket feature, INCOMPAT means it introduces incompatible on-disk format change. - Add BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) routines. - Adds __le16 bucket_size_hi into struct cache_sb_disk at offset 0x8d0 for the on-disk super block format. - For the in-memory super block struct cache_sb, member bucket_size is extended from __u16 to __32. - Add get_bucket_size() to combine the bucket_size and bucket_size_hi from struct cache_sb_disk into an unsigned int value. Since we already have large bucket size helpers meta_bucket_pages(), meta_bucket_bytes() and alloc_meta_bucket_pages(), they make sure when bucket size > 8MB, the memory allocation for bcache meta data bucket won't fail no matter how large the bucket size extended. So these meta data buckets are handled properly when the bucket size width increase from 16bit to 32bit, we don't need to worry about them. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 2 +- drivers/md/bcache/features.c | 22 ++++++++++++++++++++++ drivers/md/bcache/features.h | 9 ++++++--- drivers/md/bcache/movinggc.c | 4 ++-- drivers/md/bcache/super.c | 23 +++++++++++++++++++---- include/uapi/linux/bcache.h | 3 ++- 6 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 drivers/md/bcache/features.c (limited to 'include/uapi') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a1df0d95151c..52035a78d836 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned int next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned long next = c->nbuckets * c->sb.bucket_size / 1024; unsigned int i; int r; diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c new file mode 100644 index 000000000000..ba53944bb390 --- /dev/null +++ b/drivers/md/bcache/features.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Feature set bits and string conversion. + * Inspired by ext4's features compat/incompat/ro_compat related code. + * + * Copyright 2020 Coly Li + * + */ +#include +#include "bcache.h" + +struct feature { + int compat; + unsigned int mask; + const char *string; +}; + +static struct feature feature_list[] = { + {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET, + "large_bucket"}, + {0, 0, 0 }, +}; diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h index ae7df37b9862..dca052cf5203 100644 --- a/drivers/md/bcache/features.h +++ b/drivers/md/bcache/features.h @@ -11,9 +11,13 @@ #define BCH_FEATURE_INCOMPAT 2 #define BCH_FEATURE_TYPE_MASK 0x03 +/* Feature set definition */ +/* Incompat feature set */ +#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */ + #define BCH_FEATURE_COMPAT_SUUP 0 #define BCH_FEATURE_RO_COMPAT_SUUP 0 -#define BCH_FEATURE_INCOMPAT_SUUP 0 +#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ ((sb)->feature_compat & (mask)) @@ -22,8 +26,6 @@ #define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ ((sb)->feature_incompat & (mask)) -/* Feature set definition */ - #define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline int bch_has_feature_##name(struct cache_sb *sb) \ { \ @@ -75,4 +77,5 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ ~BCH##_FEATURE_INCOMPAT_##flagname; \ } +BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET); #endif diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index b7dd2d75f58c..5872d6470470 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -206,8 +206,8 @@ void bch_moving_gc(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) { - unsigned int sectors_to_move = 0; - unsigned int reserve_sectors = ca->sb.bucket_size * + unsigned long sectors_to_move = 0; + unsigned long reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e4f05c4ddcdd..62c9681fe92f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -60,6 +60,17 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ +static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s) +{ + unsigned int bucket_size = le16_to_cpu(s->bucket_size); + + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES && + bch_has_feature_large_bucket(sb)) + bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16; + + return bucket_size; +} + static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev, struct cache_sb_disk *s) { @@ -68,7 +79,7 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * sb->first_bucket= le16_to_cpu(s->first_bucket); sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->bucket_size = le16_to_cpu(s->bucket_size); + sb->bucket_size = get_bucket_size(sb, s); sb->nr_in_set = le16_to_cpu(s->nr_in_set); sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); @@ -210,12 +221,16 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, goto err; break; case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: - err = read_super_common(sb, bdev, s); - if (err) - goto err; + /* + * Feature bits are needed in read_super_common(), + * convert them firstly. + */ sb->feature_compat = le64_to_cpu(s->feature_compat); sb->feature_incompat = le64_to_cpu(s->feature_incompat); sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + err = read_super_common(sb, bdev, s); + if (err) + goto err; break; default: err = "Unsupported superblock version"; diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 0ef984ea515a..52e8bcb33981 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -213,6 +213,7 @@ struct cache_sb_disk { __le16 keys; }; __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ + __le16 bucket_size_hi; }; /* @@ -247,9 +248,9 @@ struct cache_sb { __u64 nbuckets; /* device size */ __u16 block_size; /* sectors */ - __u16 bucket_size; /* sectors */ __u16 nr_in_set; __u16 nr_this_dev; + __u32 bucket_size; /* sectors */ }; struct { /* Backing devices */ -- cgit v1.2.3 From 92fe2aa859f52ce6aa595ca97fec110dc7100e63 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 20 Jul 2020 15:07:30 -0700 Subject: libnvdimm: Validate command family indices The ND_CMD_CALL format allows for a general passthrough of passlisted commands targeting a given command set. However there is no validation of the family index relative to what the bus supports. - Update the NFIT bus implementation (the only one that supports ND_CMD_CALL passthrough) to also passlist the valid set of command family indices. - Update the generic __nd_ioctl() path to validate that field on behalf of all implementations. Fixes: 31eca76ba2fc ("nfit, libnvdimm: limited/whitelisted dimm command marshaling mechanism") Cc: Vishal Verma Cc: Dave Jiang Cc: Ira Weiny Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Signed-off-by: Dan Williams Signed-off-by: Vishal Verma --- drivers/acpi/nfit/core.c | 11 +++++++++-- drivers/acpi/nfit/nfit.h | 1 - drivers/nvdimm/bus.c | 16 ++++++++++++++++ include/linux/libnvdimm.h | 2 ++ include/uapi/linux/ndctl.h | 4 ++++ 5 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 7c138a4edc03..1f72ce1a782b 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1823,6 +1823,7 @@ static void populate_shutdown_status(struct nfit_mem *nfit_mem) static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, u32 device_handle) { + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; struct acpi_device *adev, *adev_dimm; struct device *dev = acpi_desc->dev; unsigned long dsm_mask, label_mask; @@ -1834,6 +1835,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, /* nfit test assumes 1:1 relationship between commands and dsms */ nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en; nfit_mem->family = NVDIMM_FAMILY_INTEL; + set_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask); if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID) sprintf(nfit_mem->id, "%04x-%02x-%04x-%08x", @@ -1886,10 +1888,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, * Note, that checking for function0 (bit0) tells us if any commands * are reachable through this GUID. */ + clear_bit(NVDIMM_FAMILY_INTEL, &nd_desc->dimm_family_mask); for (i = 0; i <= NVDIMM_FAMILY_MAX; i++) - if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) + if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) { + set_bit(i, &nd_desc->dimm_family_mask); if (family < 0 || i == default_dsm_family) family = i; + } /* limit the supported commands to those that are publicly documented */ nfit_mem->family = family; @@ -2153,6 +2158,9 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en; + set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); + set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask); + adev = to_acpi_dev(acpi_desc); if (!adev) return; @@ -2160,7 +2168,6 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &nd_desc->cmd_mask); - set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); dsm_mask = (1 << ND_CMD_ARS_CAP) | diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index f5525f8bb770..5c5e7ebba8dc 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -33,7 +33,6 @@ | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED) -#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV #define NVDIMM_CMD_MAX 31 #define NVDIMM_STANDARD_CMDMASK \ diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 09087c38fabd..955265656b96 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -1037,9 +1037,25 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, dimm_name = "bus"; } + /* Validate command family support against bus declared support */ if (cmd == ND_CMD_CALL) { + unsigned long *mask; + if (copy_from_user(&pkg, p, sizeof(pkg))) return -EFAULT; + + if (nvdimm) { + if (pkg.nd_family > NVDIMM_FAMILY_MAX) + return -EINVAL; + mask = &nd_desc->dimm_family_mask; + } else { + if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX) + return -EINVAL; + mask = &nd_desc->bus_family_mask; + } + + if (!test_bit(pkg.nd_family, mask)) + return -EINVAL; } if (!desc || diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 18da4059be09..bd39a2cf7972 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -78,6 +78,8 @@ struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; unsigned long bus_dsm_mask; unsigned long cmd_mask; + unsigned long dimm_family_mask; + unsigned long bus_family_mask; struct module *module; char *provider_name; struct device_node *of_node; diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 0e09dc5cec19..e9468b9332bd 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -245,6 +245,10 @@ struct nd_cmd_pkg { #define NVDIMM_FAMILY_MSFT 3 #define NVDIMM_FAMILY_HYPERV 4 #define NVDIMM_FAMILY_PAPR 5 +#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR + +#define NVDIMM_BUS_FAMILY_NFIT 0 +#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT #define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\ struct nd_cmd_pkg) -- cgit v1.2.3 From 6450ddbd5d8e83ea9927c7f9076a21f829699e0f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 20 Jul 2020 15:07:40 -0700 Subject: ACPI: NFIT: Define runtime firmware activation commands Platform reboots are expensive. Towards reducing downtime to apply firmware updates the Intel NVDIMM command definition is growing support for applying live firmware updates that only require temporarily suspending memory traffic instead of a full reboot. Follow-on commits add support for triggering firmware activation, this patch only defines the commands, adds probe support, and validates that they are blocked via the ioctl path. The ioctl-path block ensures that the OS is in charge since these commands have side effects only the OS can handle. Specifically firmware activation may cause the memory controller to be quiesced on the order of 100s of milliseconds. In that case Linux ensure the activation only takes place while the OS is in a suspend state. Link: https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf Cc: Vishal Verma Cc: Dave Jiang Cc: Ira Weiny Cc: "Rafael J. Wysocki" Cc: Len Brown Signed-off-by: Dan Williams Signed-off-by: Vishal Verma --- drivers/acpi/nfit/core.c | 86 +++++++++++++++++++++++++++++++--------------- drivers/acpi/nfit/intel.h | 53 ++++++++++++++++++++++++++++ drivers/acpi/nfit/nfit.h | 25 +++++++++++++- include/uapi/linux/ndctl.h | 3 +- 4 files changed, 137 insertions(+), 30 deletions(-) (limited to 'include/uapi') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 9fdd655bdf0e..78cc9e2d2aa3 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -73,6 +73,18 @@ const guid_t *to_nfit_uuid(enum nfit_uuids id) } EXPORT_SYMBOL(to_nfit_uuid); +static const guid_t *to_nfit_bus_uuid(int family) +{ + if (WARN_ONCE(family == NVDIMM_BUS_FAMILY_NFIT, + "only secondary bus families can be translated\n")) + return NULL; + /* + * The index of bus UUIDs starts immediately following the last + * NVDIMM/leaf family. + */ + return to_nfit_uuid(family + NVDIMM_FAMILY_MAX); +} + static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; @@ -362,24 +374,8 @@ static u8 nfit_dsm_revid(unsigned family, unsigned func) { static const u8 revid_table[NVDIMM_FAMILY_MAX+1][NVDIMM_CMD_MAX+1] = { [NVDIMM_FAMILY_INTEL] = { - [NVDIMM_INTEL_GET_MODES] = 2, - [NVDIMM_INTEL_GET_FWINFO] = 2, - [NVDIMM_INTEL_START_FWUPDATE] = 2, - [NVDIMM_INTEL_SEND_FWUPDATE] = 2, - [NVDIMM_INTEL_FINISH_FWUPDATE] = 2, - [NVDIMM_INTEL_QUERY_FWUPDATE] = 2, - [NVDIMM_INTEL_SET_THRESHOLD] = 2, - [NVDIMM_INTEL_INJECT_ERROR] = 2, - [NVDIMM_INTEL_GET_SECURITY_STATE] = 2, - [NVDIMM_INTEL_SET_PASSPHRASE] = 2, - [NVDIMM_INTEL_DISABLE_PASSPHRASE] = 2, - [NVDIMM_INTEL_UNLOCK_UNIT] = 2, - [NVDIMM_INTEL_FREEZE_LOCK] = 2, - [NVDIMM_INTEL_SECURE_ERASE] = 2, - [NVDIMM_INTEL_OVERWRITE] = 2, - [NVDIMM_INTEL_QUERY_OVERWRITE] = 2, - [NVDIMM_INTEL_SET_MASTER_PASSPHRASE] = 2, - [NVDIMM_INTEL_MASTER_SECURE_ERASE] = 2, + [NVDIMM_INTEL_GET_MODES ... + NVDIMM_INTEL_FW_ACTIVATE_ARM] = 2, }, }; u8 id; @@ -406,7 +402,7 @@ static bool payload_dumpable(struct nvdimm *nvdimm, unsigned int func) } static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned int cmd, - struct nd_cmd_pkg *call_pkg) + struct nd_cmd_pkg *call_pkg, int *family) { if (call_pkg) { int i; @@ -417,6 +413,7 @@ static int cmd_to_func(struct nfit_mem *nfit_mem, unsigned int cmd, for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++) if (call_pkg->nd_reserved2[i]) return -EINVAL; + *family = call_pkg->nd_family; return call_pkg->nd_command; } @@ -450,13 +447,14 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, acpi_handle handle; const guid_t *guid; int func, rc, i; + int family = 0; if (cmd_rc) *cmd_rc = -EINVAL; if (cmd == ND_CMD_CALL) call_pkg = buf; - func = cmd_to_func(nfit_mem, cmd, call_pkg); + func = cmd_to_func(nfit_mem, cmd, call_pkg, &family); if (func < 0) return func; @@ -478,9 +476,17 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, cmd_name = nvdimm_bus_cmd_name(cmd); cmd_mask = nd_desc->cmd_mask; - dsm_mask = acpi_desc->bus_dsm_mask; + if (cmd == ND_CMD_CALL && call_pkg->nd_family) { + family = call_pkg->nd_family; + if (!test_bit(family, &nd_desc->bus_family_mask)) + return -EINVAL; + dsm_mask = acpi_desc->family_dsm_mask[family]; + guid = to_nfit_bus_uuid(family); + } else { + dsm_mask = acpi_desc->bus_dsm_mask; + guid = to_nfit_uuid(NFIT_DEV_BUS); + } desc = nd_cmd_bus_desc(cmd); - guid = to_nfit_uuid(NFIT_DEV_BUS); handle = adev->handle; dimm_name = "bus"; } @@ -516,8 +522,8 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, in_buf.buffer.length = call_pkg->nd_size_in; } - dev_dbg(dev, "%s cmd: %d: func: %d input length: %d\n", - dimm_name, cmd, func, in_buf.buffer.length); + dev_dbg(dev, "%s cmd: %d: family: %d func: %d input length: %d\n", + dimm_name, cmd, family, func, in_buf.buffer.length); if (payload_dumpable(nvdimm, func)) print_hex_dump_debug("nvdimm in ", DUMP_PREFIX_OFFSET, 4, 4, in_buf.buffer.pointer, @@ -2153,14 +2159,21 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; const guid_t *guid = to_nfit_uuid(NFIT_DEV_BUS); + unsigned long dsm_mask, *mask; struct acpi_device *adev; - unsigned long dsm_mask; int i; - nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); set_bit(NVDIMM_BUS_FAMILY_NFIT, &nd_desc->bus_family_mask); + /* enable nfit_test to inject bus command emulation */ + if (acpi_desc->bus_cmd_force_en) { + nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; + mask = &nd_desc->bus_family_mask; + if (acpi_desc->family_dsm_mask[NVDIMM_BUS_FAMILY_INTEL]) + set_bit(NVDIMM_BUS_FAMILY_INTEL, mask); + } + adev = to_acpi_dev(acpi_desc); if (!adev) return; @@ -2181,6 +2194,14 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &acpi_desc->bus_dsm_mask); + + /* Enumerate allowed NVDIMM_BUS_FAMILY_INTEL commands */ + dsm_mask = NVDIMM_BUS_INTEL_FW_ACTIVATE_CMDMASK; + guid = to_nfit_bus_uuid(NVDIMM_BUS_FAMILY_INTEL); + mask = &acpi_desc->family_dsm_mask[NVDIMM_BUS_FAMILY_INTEL]; + for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) + if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) + set_bit(i, mask); } static ssize_t range_index_show(struct device *dev, @@ -3492,7 +3513,10 @@ static int __acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, return 0; } -/* prevent security commands from being issued via ioctl */ +/* + * Prevent security and firmware activate commands from being issued via + * ioctl. + */ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf) { @@ -3503,10 +3527,15 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, call_pkg->nd_family == NVDIMM_FAMILY_INTEL) { func = call_pkg->nd_command; if (func > NVDIMM_CMD_MAX || - (1 << func) & NVDIMM_INTEL_SECURITY_CMDMASK) + (1 << func) & NVDIMM_INTEL_DENY_CMDMASK) return -EOPNOTSUPP; } + /* block all non-nfit bus commands */ + if (!nvdimm && cmd == ND_CMD_CALL && + call_pkg->nd_family != NVDIMM_BUS_FAMILY_NFIT) + return -EOPNOTSUPP; + return __acpi_nfit_clear_to_send(nd_desc, nvdimm, cmd); } @@ -3798,6 +3827,7 @@ static __init int nfit_init(void) guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]); guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]); guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]); + guid_parse(UUID_INTEL_BUS, &nfit_uuid[NFIT_BUS_INTEL]); nfit_wq = create_singlethread_workqueue("nfit"); if (!nfit_wq) diff --git a/drivers/acpi/nfit/intel.h b/drivers/acpi/nfit/intel.h index 0aca682ab9d7..868d073731cc 100644 --- a/drivers/acpi/nfit/intel.h +++ b/drivers/acpi/nfit/intel.h @@ -111,4 +111,57 @@ struct nd_intel_master_secure_erase { u8 passphrase[ND_INTEL_PASSPHRASE_SIZE]; u32 status; } __packed; + +#define ND_INTEL_FWA_IDLE 0 +#define ND_INTEL_FWA_ARMED 1 +#define ND_INTEL_FWA_BUSY 2 + +#define ND_INTEL_DIMM_FWA_NONE 0 +#define ND_INTEL_DIMM_FWA_NOTSTAGED 1 +#define ND_INTEL_DIMM_FWA_SUCCESS 2 +#define ND_INTEL_DIMM_FWA_NEEDRESET 3 +#define ND_INTEL_DIMM_FWA_MEDIAFAILED 4 +#define ND_INTEL_DIMM_FWA_ABORT 5 +#define ND_INTEL_DIMM_FWA_NOTSUPP 6 +#define ND_INTEL_DIMM_FWA_ERROR 7 + +struct nd_intel_fw_activate_dimminfo { + u32 status; + u16 result; + u8 state; + u8 reserved[7]; +} __packed; + +struct nd_intel_fw_activate_arm { + u8 activate_arm; + u32 status; +} __packed; + +/* Root device command payloads */ +#define ND_INTEL_BUS_FWA_CAP_FWQUIESCE (1 << 0) +#define ND_INTEL_BUS_FWA_CAP_OSQUIESCE (1 << 1) +#define ND_INTEL_BUS_FWA_CAP_RESET (1 << 2) + +struct nd_intel_bus_fw_activate_businfo { + u32 status; + u16 reserved; + u8 state; + u8 capability; + u64 activate_tmo; + u64 cpu_quiesce_tmo; + u64 io_quiesce_tmo; + u64 max_quiesce_tmo; +} __packed; + +#define ND_INTEL_BUS_FWA_STATUS_NOARM (6 | 1 << 16) +#define ND_INTEL_BUS_FWA_STATUS_BUSY (6 | 2 << 16) +#define ND_INTEL_BUS_FWA_STATUS_NOFW (6 | 3 << 16) +#define ND_INTEL_BUS_FWA_STATUS_TMO (6 | 4 << 16) +#define ND_INTEL_BUS_FWA_STATUS_NOIDLE (6 | 5 << 16) +#define ND_INTEL_BUS_FWA_STATUS_ABORT (6 | 6 << 16) + +struct nd_intel_bus_fw_activate { + u8 iodev_state; + u32 status; +} __packed; #endif diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index da097149d94d..97c122628975 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -18,6 +18,7 @@ /* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */ #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66" +#define UUID_INTEL_BUS "c7d8acd4-2df8-4b82-9f65-a325335af149" /* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */ #define UUID_NFIT_DIMM_N_HPE1 "9002c334-acf3-4c0e-9642-a235f0d53bc6" @@ -65,6 +66,13 @@ enum nvdimm_family_cmds { NVDIMM_INTEL_QUERY_OVERWRITE = 26, NVDIMM_INTEL_SET_MASTER_PASSPHRASE = 27, NVDIMM_INTEL_MASTER_SECURE_ERASE = 28, + NVDIMM_INTEL_FW_ACTIVATE_DIMMINFO = 29, + NVDIMM_INTEL_FW_ACTIVATE_ARM = 30, +}; + +enum nvdimm_bus_family_cmds { + NVDIMM_BUS_INTEL_FW_ACTIVATE_BUSINFO = 1, + NVDIMM_BUS_INTEL_FW_ACTIVATE = 2, }; #define NVDIMM_INTEL_SECURITY_CMDMASK \ @@ -75,13 +83,22 @@ enum nvdimm_family_cmds { | 1 << NVDIMM_INTEL_SET_MASTER_PASSPHRASE \ | 1 << NVDIMM_INTEL_MASTER_SECURE_ERASE) +#define NVDIMM_INTEL_FW_ACTIVATE_CMDMASK \ +(1 << NVDIMM_INTEL_FW_ACTIVATE_DIMMINFO | 1 << NVDIMM_INTEL_FW_ACTIVATE_ARM) + +#define NVDIMM_BUS_INTEL_FW_ACTIVATE_CMDMASK \ +(1 << NVDIMM_BUS_INTEL_FW_ACTIVATE_BUSINFO | 1 << NVDIMM_BUS_INTEL_FW_ACTIVATE) + #define NVDIMM_INTEL_CMDMASK \ (NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \ | 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \ | 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \ | 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \ | 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN \ - | NVDIMM_INTEL_SECURITY_CMDMASK) + | NVDIMM_INTEL_SECURITY_CMDMASK | NVDIMM_INTEL_FW_ACTIVATE_CMDMASK) + +#define NVDIMM_INTEL_DENY_CMDMASK \ +(NVDIMM_INTEL_SECURITY_CMDMASK | NVDIMM_INTEL_FW_ACTIVATE_CMDMASK) enum nfit_uuids { /* for simplicity alias the uuid index with the family id */ @@ -90,6 +107,11 @@ enum nfit_uuids { NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2, NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT, NFIT_DEV_DIMM_N_HYPERV = NVDIMM_FAMILY_HYPERV, + /* + * to_nfit_bus_uuid() expects to translate bus uuid family ids + * to a UUID index using NVDIMM_FAMILY_MAX as an offset + */ + NFIT_BUS_INTEL = NVDIMM_FAMILY_MAX + NVDIMM_BUS_FAMILY_INTEL, NFIT_SPA_VOLATILE, NFIT_SPA_PM, NFIT_SPA_DCR, @@ -238,6 +260,7 @@ struct acpi_nfit_desc { unsigned long dimm_cmd_force_en; unsigned long bus_cmd_force_en; unsigned long bus_dsm_mask; + unsigned long family_dsm_mask[NVDIMM_BUS_FAMILY_MAX + 1]; unsigned int platform_cap; unsigned int scrub_tmo; int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index e9468b9332bd..8cf1e4884fd5 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -248,7 +248,8 @@ struct nd_cmd_pkg { #define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_PAPR #define NVDIMM_BUS_FAMILY_NFIT 0 -#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_NFIT +#define NVDIMM_BUS_FAMILY_INTEL 1 +#define NVDIMM_BUS_FAMILY_MAX NVDIMM_BUS_FAMILY_INTEL #define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\ struct nd_cmd_pkg) -- cgit v1.2.3 From a5cbe05a6673b85bed2a63ffcfea6a96c6410cff Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:12 -0700 Subject: bpf: Implement bpf iterator for map elements The bpf iterator for map elements are implemented. The bpf program will receive four parameters: bpf_iter_meta *meta: the meta data bpf_map *map: the bpf_map whose elements are traversed void *key: the key of one element void *value: the value of the same element Here, meta and map pointers are always valid, and key has register type PTR_TO_RDONLY_BUF_OR_NULL and value has register type PTR_TO_RDWR_BUF_OR_NULL. The kernel will track the access range of key and value during verification time. Later, these values will be compared against the values in the actual map to ensure all accesses are within range. A new field iter_seq_info is added to bpf_map_ops which is used to add map type specific information, i.e., seq_ops, init/fini seq_file func and seq_file private data size. Subsequent patches will have actual implementation for bpf_map_ops->iter_seq_info. In user space, BPF_ITER_LINK_MAP_FD needs to be specified in prog attr->link_create.flags, which indicates that attr->link_create.target_fd is a map_fd. The reason for such an explicit flag is for possible future cases where one bpf iterator may allow more than one possible customization, e.g., pid and cgroup id for task_file. Current kernel internal implementation only allows the target to register at most one required bpf_iter_link_info. To support the above case, optional bpf_iter_link_info's are needed, the target can be extended to register such link infos, and user provided link_info needs to match one of target supported ones. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com --- include/linux/bpf.h | 16 ++++++++ include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/bpf_iter.c | 85 ++++++++++++++++++++++++++++++++++-------- kernel/bpf/map_iter.c | 30 ++++++++++++++- tools/include/uapi/linux/bpf.h | 7 ++++ 5 files changed, 128 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f9c4bb08f616..4175cf1f4665 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -107,6 +107,9 @@ struct bpf_map_ops { /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; + + /* bpf_iter info used to open a seq_file */ + const struct bpf_iter_seq_info *iter_seq_info; }; struct bpf_map_memory { @@ -1207,12 +1210,18 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + struct bpf_map *map; }; +typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux); + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; + bpf_iter_check_target_t check_target; u32 ctx_arg_info_size; + enum bpf_iter_link_info req_linfo; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; @@ -1223,6 +1232,13 @@ struct bpf_iter_meta { u64 seq_num; }; +struct bpf_iter__bpf_map_elem { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(void *, value); +}; + int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8fa94cb1b5a0..363b9cafc2d8 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -14,11 +14,13 @@ struct bpf_iter_target_info { struct bpf_iter_link { struct bpf_link link; + struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; + const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; @@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -199,11 +202,25 @@ done: return copied; } +static const struct bpf_iter_seq_info * +__get_seq_info(struct bpf_iter_link *link) +{ + const struct bpf_iter_seq_info *seq_info; + + if (link->aux.map) { + seq_info = link->aux.map->ops->iter_seq_info; + if (seq_info) + return seq_info; + } + + return link->tinfo->reg_info->seq_info; +} + static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link); + return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) @@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private) - iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); + if (iter_priv->seq_info->fini_seq_private) + iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) static void bpf_iter_link_release(struct bpf_link *link) { + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + if (iter_link->aux.map) + bpf_map_put_with_uref(iter_link->aux.map); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -370,14 +392,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; + struct bpf_iter_aux_info aux = {}; struct bpf_iter_link *link; + u32 prog_btf_id, target_fd; bool existed = false; - u32 prog_btf_id; + struct bpf_map *map; int err; - if (attr->link_create.target_fd || attr->link_create.flags) - return -EINVAL; - prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -390,6 +411,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; + /* Make sure user supplied flags are target expected. */ + target_fd = attr->link_create.target_fd; + if (attr->link_create.flags != tinfo->reg_info->req_linfo) + return -EINVAL; + if (!attr->link_create.flags && target_fd) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -403,21 +431,45 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { + map = bpf_map_get_with_uref(target_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto cleanup_link; + } + + aux.map = map; + err = tinfo->reg_info->check_target(prog, &aux); + if (err) { + bpf_map_put_with_uref(map); + goto cleanup_link; + } + + link->aux.map = map; + } + return bpf_link_settle(&link_primer); + +cleanup_link: + bpf_link_cleanup(&link_primer); + return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, + const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; + priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; @@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, + seq_info->seq_priv_size; + priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->seq_info->init_seq_private) { - err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); + if (seq_info->init_seq_private) { + err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } - init_seq_meta(priv_data, tinfo, prog); + init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; @@ -463,6 +515,7 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { + struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; @@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link) goto free_fd; } - err = prepare_seq_file(file, - container_of(link, struct bpf_iter_link, link)); + iter_link = container_of(link, struct bpf_iter_link, link); + err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 1a69241fb1e2..8a1f9b3355d0 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,10 +98,38 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + return -EINVAL; +} + +DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, void *value) + +static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .target = "bpf_map_elem", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map_elem, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__bpf_map_elem, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, +}; + static int __init bpf_map_iter_init(void) { + int ret; + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; - return bpf_iter_reg_target(&bpf_map_reg_info); + ret = bpf_iter_reg_target(&bpf_map_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&bpf_map_elem_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. -- cgit v1.2.3 From aa8d3a716b59db6c1ad6c68fb8aa05e31980da60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:57 -0700 Subject: bpf, xdp: Add bpf_link-based XDP attachment API Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through BPF_LINK_CREATE command. bpf_xdp_link is mutually exclusive with direct BPF program attachment, previous BPF program should be detached prior to attempting to create a new bpf_xdp_link attachment (for a given XDP mode). Once BPF link is attached, it can't be replaced by other BPF program attachment or link attachment. It will be detached only when the last BPF link FD is closed. bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to how other BPF links behave (cgroup, flow_dissector). At that point bpf_link will become defunct, but won't be destroyed until last FD is closed. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-5-andriin@fb.com --- include/linux/netdevice.h | 4 ++ include/uapi/linux/bpf.h | 7 +- kernel/bpf/syscall.c | 5 ++ net/core/dev.c | 169 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 178 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cad44b40c776..7d3c412fcfe5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -888,6 +888,7 @@ struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; +struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, @@ -898,6 +899,7 @@ enum bpf_xdp_mode { struct bpf_xdp_entity { struct bpf_prog *prog; + struct bpf_xdp_link *link; }; struct netdev_bpf { @@ -3831,7 +3833,9 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); + int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 828c2f6438f2..87823fb9c123 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -242,6 +243,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -614,7 +616,10 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ __u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee290b1f2d9e..0e8c88db7e7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2824,6 +2824,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_TRACING; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3921,6 +3923,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/net/core/dev.c b/net/core/dev.c index 521ce031ee35..e24248f3d675 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8716,6 +8716,12 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; + static enum bpf_xdp_mode dev_xdp_mode(u32 flags) { if (flags & XDP_FLAGS_HW_MODE) @@ -8738,9 +8744,19 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) }; } +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; return dev->xdp_state[mode].prog; } @@ -8751,9 +8767,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) return prog ? prog->aux->id : 0; } +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { + dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } @@ -8793,6 +8817,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, static void dev_xdp_uninstall(struct net_device *dev) { + struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -8810,14 +8835,20 @@ static void dev_xdp_uninstall(struct net_device *dev) WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); - bpf_prog_put(prog); - dev_xdp_set_prog(dev, mode, NULL); + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, - struct bpf_prog *new_prog, struct bpf_prog *old_prog, - u32 flags) + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) { struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; @@ -8826,6 +8857,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack ASSERT_RTNL(); + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } /* just one XDP mode bit should be set, zero defaults to SKB mode */ if (hweight32(flags & XDP_FLAGS_MODES) > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); @@ -8838,7 +8877,18 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack } mode = dev_xdp_mode(flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; @@ -8848,6 +8898,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; + if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB @@ -8884,13 +8938,116 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return err; } - dev_xdp_set_prog(dev, mode, new_prog); + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + + rtnl_unlock(); +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_dev; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_dev; + } + + rtnl_lock(); + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +out_put_dev: + dev_put(dev); + return err; +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -8927,7 +9084,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } } - err = dev_xdp_attach(dev, extack, new_prog, old_prog, flags); + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) -- cgit v1.2.3 From c1931c9784ebb5787c0784c112fb8baa5e8455b3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:59 -0700 Subject: bpf: Implement BPF XDP link-specific introspection APIs Implement XDP link-specific show_fdinfo and link_info to emit ifindex. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-7-andriin@fb.com --- include/uapi/linux/bpf.h | 3 +++ net/core/dev.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 87823fb9c123..e1ba4ae6a916 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4069,6 +4069,9 @@ struct bpf_link_info { __u32 netns_ino; __u32 attach_type; } netns; + struct { + __u32 ifindex; + } xdp; }; } __attribute__((aligned(8))); diff --git a/net/core/dev.c b/net/core/dev.c index 49f284f51a22..82ce0920b172 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8996,6 +8996,35 @@ static void bpf_xdp_link_dealloc(struct bpf_link *link) kfree(xdp_link); } +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { @@ -9041,6 +9070,8 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; -- cgit v1.2.3 From da3a9e9a6aa96ef589c153078f66e0646bf06b55 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 23 Jul 2020 11:05:50 +0200 Subject: drm/fourcc: fix Amlogic Video Framebuffer Compression macro Fix the Amlogic Video Framebuffer Compression modifier macro to correctly add the layout options, a pair of parenthesis was missing. Fixes: d6528ec88309 ("drm/fourcc: Add modifier definitions for describing Amlogic Video Framebuffer Compression") Signed-off-by: Neil Armstrong Acked-by: Sam Ravnborg Link: https://patchwork.freedesktop.org/patch/msgid/20200723090551.27529-1-narmstrong@baylibre.com --- include/uapi/drm/drm_fourcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 4bee7de5f306..82f327801267 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -1004,7 +1004,7 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) #define DRM_FORMAT_MOD_AMLOGIC_FBC(__layout, __options) \ fourcc_mod_code(AMLOGIC, \ ((__layout) & __fourcc_mod_amlogic_layout_mask) | \ - ((__options) & __fourcc_mod_amlogic_options_mask \ + (((__options) & __fourcc_mod_amlogic_options_mask) \ << __fourcc_mod_amlogic_options_shift)) /* Amlogic FBC Layouts */ -- cgit v1.2.3 From 1a0c02ba643ed05c07ddf14d87c3bec640666836 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 20 Jul 2020 08:50:58 -0700 Subject: dmaengine: idxd: add missing invalid flags field to completion Add missing "invalid flags" field to completion record struct. Reported-by: Nikhil Rao Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/159526025819.49266.13176787210106133664.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 1f412fbf561b..5d07a3ff9e96 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -178,6 +178,12 @@ struct dsa_completion_record { uint32_t bytes_completed; uint64_t fault_addr; union { + /* common record */ + struct { + uint32_t invalid_flags:24; + uint32_t rsvd2:8; + }; + uint16_t delta_rec_size; uint16_t crc_val; -- cgit v1.2.3 From 06f67c47076e5c3ee65276171596479dcc3a3941 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 28 Jun 2020 13:07:14 +0800 Subject: btrfs: use __u16 for the return value of btrfs_qgroup_level() The qgroup level is limited to u16, so no need to use u64 for it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 +- include/uapi/linux/btrfs_tree.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 787128d7e196..229e461dbfc3 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -538,7 +538,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) if (qgroup->rsv.values[i]) { ret = true; btrfs_warn(fs_info, - "qgroup %llu/%llu has unreleased space, type %d rsv %llu", + "qgroup %hu/%llu has unreleased space, type %d rsv %llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid), i, qgroup->rsv.values[i]); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index a3f3975df0de..9ba64ca6b4ac 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -913,9 +913,9 @@ struct btrfs_free_space_info { #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) #define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline __u64 btrfs_qgroup_level(__u64 qgroupid) +static inline __u16 btrfs_qgroup_level(__u64 qgroupid) { - return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; + return (__u16)(qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT); } /* -- cgit v1.2.3 From 137c541821a83debb63b3fa8abdd1cbc41bdf3a1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:28:58 +0900 Subject: btrfs: pass checksum type via BTRFS_IOC_FS_INFO ioctl With the recent addition of filesystem checksum types other than CRC32c, it is not anymore hard-coded which checksum type a btrfs filesystem uses. Up to now there is no good way to read the filesystem checksum, apart from reading the filesystem UUID and then query sysfs for the checksum type. Add a new csum_type and csum_size fields to the BTRFS_IOC_FS_INFO ioctl command which usually is used to query filesystem features. Also add a flags member indicating that the kernel responded with a set csum_type and csum_size field. For compatibility reasons, only return the csum_type and csum_size if the BTRFS_FS_INFO_FLAG_CSUM_INFO flag was passed to the kernel. Also clear any unknown flags so we don't pass false positives to user-space newer than the kernel. To simplify further additions to the ioctl, also switch the padding to a u8 array. Pahole was used to verify the result of this switch: The csum members are added before flags, which might look odd, but this is to keep the alignment requirements and not to introduce holes in the structure. $ pahole -C btrfs_ioctl_fs_info_args fs/btrfs/btrfs.ko struct btrfs_ioctl_fs_info_args { __u64 max_id; /* 0 8 */ __u64 num_devices; /* 8 8 */ __u8 fsid[16]; /* 16 16 */ __u32 nodesize; /* 32 4 */ __u32 sectorsize; /* 36 4 */ __u32 clone_alignment; /* 40 4 */ __u16 csum_type; /* 44 2 */ __u16 csum_size; /* 46 2 */ __u64 flags; /* 48 8 */ __u8 reserved[968]; /* 56 968 */ /* size: 1024, cachelines: 16, members: 10 */ }; Fixes: 3951e7f050ac ("btrfs: add xxhash64 to checksumming algorithms") Fixes: 3831bf0094ab ("btrfs: add sha256 to checksumming algorithm") CC: stable@vger.kernel.org # 5.5+ Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 +++++++++++++--- include/uapi/linux/btrfs.h | 14 ++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab34179d7cbc..2854f4a40787 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3217,11 +3217,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 flags_in; int ret = 0; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); - if (!fi_args) - return -ENOMEM; + fi_args = memdup_user(arg, sizeof(*fi_args)); + if (IS_ERR(fi_args)) + return PTR_ERR(fi_args); + + flags_in = fi_args->flags; + memset(fi_args, 0, sizeof(*fi_args)); rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; @@ -3237,6 +3241,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; + if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { + fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy); + fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); + fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e6b6cb0f8bc6..24f6848ad78e 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -243,6 +243,13 @@ struct btrfs_ioctl_dev_info_args { __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ }; +/* + * Retrieve information about the filesystem + */ + +/* Request information about checksum type and size */ +#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0) + struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ __u64 num_devices; /* out */ @@ -250,8 +257,11 @@ struct btrfs_ioctl_fs_info_args { __u32 nodesize; /* out */ __u32 sectorsize; /* out */ __u32 clone_alignment; /* out */ - __u32 reserved32; - __u64 reserved[122]; /* pad to 1k */ + /* See BTRFS_FS_INFO_FLAG_* */ + __u16 csum_type; /* out */ + __u16 csum_size; /* out */ + __u64 flags; /* in/out */ + __u8 reserved[968]; /* pad to 1k */ }; /* -- cgit v1.2.3 From 0fb408a558aadbaa58beb75b02c95741e1fbb514 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:28:59 +0900 Subject: btrfs: add filesystem generation to FS_INFO ioctl Add retrieval of the filesystem's generation to the fsinfo ioctl. This is driven by setting the BTRFS_FS_INFO_FLAG_GENERATION flag in btrfs_ioctl_fs_info_args::flags. Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +++++ include/uapi/linux/btrfs.h | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2854f4a40787..55dd20d0f9cb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3247,6 +3247,11 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; } + if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { + fi_args->generation = fs_info->generation; + fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 24f6848ad78e..9b82e01c191d 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -250,6 +250,9 @@ struct btrfs_ioctl_dev_info_args { /* Request information about checksum type and size */ #define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0) +/* Request information about filesystem generation */ +#define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1) + struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ __u64 num_devices; /* out */ @@ -261,7 +264,8 @@ struct btrfs_ioctl_fs_info_args { __u16 csum_type; /* out */ __u16 csum_size; /* out */ __u64 flags; /* in/out */ - __u8 reserved[968]; /* pad to 1k */ + __u64 generation; /* out */ + __u8 reserved[960]; /* pad to 1k */ }; /* -- cgit v1.2.3 From 49bac897683340457a0ab1f76b924a1220bdb604 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:29:00 +0900 Subject: btrfs: add metadata_uuid to FS_INFO ioctl Add retrieval of the filesystem's metadata UUID to the fsinfo ioctl. This is driven by setting the BTRFS_FS_INFO_FLAG_METADATA_UUID flag in btrfs_ioctl_fs_info_args::flags. Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 ++++++ include/uapi/linux/btrfs.h | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 55dd20d0f9cb..b4ddf51ae377 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3252,6 +3252,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } + if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) { + memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid, + sizeof(fi_args->metadata_uuid)); + fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 9b82e01c191d..2c39d15a2beb 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -252,6 +252,8 @@ struct btrfs_ioctl_dev_info_args { /* Request information about filesystem generation */ #define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1) +/* Request information about filesystem metadata UUID */ +#define BTRFS_FS_INFO_FLAG_METADATA_UUID (1 << 2) struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ @@ -265,7 +267,8 @@ struct btrfs_ioctl_fs_info_args { __u16 csum_size; /* out */ __u64 flags; /* in/out */ __u64 generation; /* out */ - __u8 reserved[960]; /* pad to 1k */ + __u8 metadata_uuid[BTRFS_FSID_SIZE]; /* out */ + __u8 reserved[944]; /* pad to 1k */ }; /* -- cgit v1.2.3 From 50c8a002bfd43798768ad07833b2b9d4b4d5274f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:29:03 -0700 Subject: platform/x86: ISST: drop a duplicated word in isst_if.h Drop the repeated word "for" in a comment. Signed-off-by: Randy Dunlap Cc: Srinivas Pandruvada Cc: platform-driver-x86@vger.kernel.org Cc: Darren Hart Cc: Andy Shevchenko Acked-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- include/uapi/linux/isst_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index 0a52b7b093d3..ba078f8e9add 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -69,7 +69,7 @@ struct isst_if_cpu_maps { * @logical_cpu: Logical CPU number to get target PCI device. * @reg: PUNIT register offset * @value: For write operation value to write and for - * for read placeholder read value + * read placeholder read value * * Structure to specify read/write data to PUNIT registers. */ -- cgit v1.2.3 From 1e6b57d6421f0343dd11619612e5ff8930cddf38 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 11 Jun 2020 11:11:32 -0400 Subject: unexport linux/elfcore.h It's unusable from userland - it uses elf_gregset_t, which is not provided by exported headers. glibc has it in sys/procfs.h, but the same file defines struct elf_prstatus, so linux/elfcore.h can't be included once sys/procfs.h has been pulled. Same goes for uclibc and dietlibc simply doesn't have elf_gregset_t defined anywhere. IOW, no userland source is including that thing. Signed-off-by: Al Viro --- include/linux/elfcore.h | 69 +++++++++++++++++++++++++++-- include/uapi/linux/elfcore.h | 101 ------------------------------------------- scripts/headers_install.sh | 1 - usr/include/Makefile | 1 - 4 files changed, 66 insertions(+), 106 deletions(-) delete mode 100644 include/uapi/linux/elfcore.h (limited to 'include/uapi') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 4cad0e784b28..96ab215dad2d 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -5,12 +5,75 @@ #include #include #include - -#include -#include +#include +#include +#include +#include +#include +#include struct coredump_params; +struct elf_siginfo +{ + int si_signo; /* signal number */ + int si_code; /* extra code */ + int si_errno; /* errno */ +}; + +/* + * Definitions to generate Intel SVR4-like core files. + * These mostly have the same names as the SVR4 types with "elf_" + * tacked on the front to prevent clashes with linux definitions, + * and the typedef forms have been avoided. This is mostly like + * the SVR4 structure, but more Linuxy, with things that Linux does + * not support and which gdb doesn't really use excluded. + */ +struct elf_prstatus +{ + struct elf_siginfo pr_info; /* Info associated with signal */ + short pr_cursig; /* Current signal */ + unsigned long pr_sigpend; /* Set of pending signals */ + unsigned long pr_sighold; /* Set of held signals */ + pid_t pr_pid; + pid_t pr_ppid; + pid_t pr_pgrp; + pid_t pr_sid; + struct __kernel_old_timeval pr_utime; /* User time */ + struct __kernel_old_timeval pr_stime; /* System time */ + struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ + struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ + elf_gregset_t pr_reg; /* GP registers */ +#ifdef CONFIG_BINFMT_ELF_FDPIC + /* When using FDPIC, the loadmap addresses need to be communicated + * to GDB in order for GDB to do the necessary relocations. The + * fields (below) used to communicate this information are placed + * immediately after ``pr_reg'', so that the loadmap addresses may + * be viewed as part of the register set if so desired. + */ + unsigned long pr_exec_fdpic_loadmap; + unsigned long pr_interp_fdpic_loadmap; +#endif + int pr_fpvalid; /* True if math co-processor being used. */ +}; + +#define ELF_PRARGSZ (80) /* Number of chars for args */ + +struct elf_prpsinfo +{ + char pr_state; /* numeric process state */ + char pr_sname; /* char for pr_state */ + char pr_zomb; /* zombie */ + char pr_nice; /* nice val */ + unsigned long pr_flag; /* flags */ + __kernel_uid_t pr_uid; + __kernel_gid_t pr_gid; + pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; + /* Lots missing */ + char pr_fname[16]; /* filename of executable */ + char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ +}; + static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) { #ifdef ELF_CORE_COPY_REGS diff --git a/include/uapi/linux/elfcore.h b/include/uapi/linux/elfcore.h deleted file mode 100644 index baf03562306d..000000000000 --- a/include/uapi/linux/elfcore.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_ELFCORE_H -#define _UAPI_LINUX_ELFCORE_H - -#include -#include -#include -#include -#include -#include - -struct elf_siginfo -{ - int si_signo; /* signal number */ - int si_code; /* extra code */ - int si_errno; /* errno */ -}; - - -#ifndef __KERNEL__ -typedef elf_greg_t greg_t; -typedef elf_gregset_t gregset_t; -typedef elf_fpregset_t fpregset_t; -typedef elf_fpxregset_t fpxregset_t; -#define NGREG ELF_NGREG -#endif - -/* - * Definitions to generate Intel SVR4-like core files. - * These mostly have the same names as the SVR4 types with "elf_" - * tacked on the front to prevent clashes with linux definitions, - * and the typedef forms have been avoided. This is mostly like - * the SVR4 structure, but more Linuxy, with things that Linux does - * not support and which gdb doesn't really use excluded. - * Fields present but not used are marked with "XXX". - */ -struct elf_prstatus -{ -#if 0 - long pr_flags; /* XXX Process flags */ - short pr_why; /* XXX Reason for process halt */ - short pr_what; /* XXX More detailed reason */ -#endif - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned long pr_sigpend; /* Set of pending signals */ - unsigned long pr_sighold; /* Set of held signals */ -#if 0 - struct sigaltstack pr_altstack; /* Alternate stack info */ - struct sigaction pr_action; /* Signal action for current sig */ -#endif - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct __kernel_old_timeval pr_utime; /* User time */ - struct __kernel_old_timeval pr_stime; /* System time */ - struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ - struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ -#if 0 - long pr_instr; /* Current instruction */ -#endif - elf_gregset_t pr_reg; /* GP registers */ -#ifdef CONFIG_BINFMT_ELF_FDPIC - /* When using FDPIC, the loadmap addresses need to be communicated - * to GDB in order for GDB to do the necessary relocations. The - * fields (below) used to communicate this information are placed - * immediately after ``pr_reg'', so that the loadmap addresses may - * be viewed as part of the register set if so desired. - */ - unsigned long pr_exec_fdpic_loadmap; - unsigned long pr_interp_fdpic_loadmap; -#endif - int pr_fpvalid; /* True if math co-processor being used. */ -}; - -#define ELF_PRARGSZ (80) /* Number of chars for args */ - -struct elf_prpsinfo -{ - char pr_state; /* numeric process state */ - char pr_sname; /* char for pr_state */ - char pr_zomb; /* zombie */ - char pr_nice; /* nice val */ - unsigned long pr_flag; /* flags */ - __kernel_uid_t pr_uid; - __kernel_gid_t pr_gid; - pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; - /* Lots missing */ - char pr_fname[16]; /* filename of executable */ - char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ -}; - -#ifndef __KERNEL__ -typedef struct elf_prstatus prstatus_t; -typedef struct elf_prpsinfo prpsinfo_t; -#define PRARGSZ ELF_PRARGSZ -#endif - - -#endif /* _UAPI_LINUX_ELFCORE_H */ diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index 955cf3aedf21..9314247bb222 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -86,7 +86,6 @@ arch/x86/include/uapi/asm/auxvec.h:CONFIG_X86_64 arch/x86/include/uapi/asm/mman.h:CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS include/uapi/asm-generic/fcntl.h:CONFIG_64BIT include/uapi/linux/atmdev.h:CONFIG_COMPAT -include/uapi/linux/elfcore.h:CONFIG_BINFMT_ELF_FDPIC include/uapi/linux/eventpoll.h:CONFIG_PM_SLEEP include/uapi/linux/hw_breakpoint.h:CONFIG_HAVE_MIXED_BREAKPOINTS_REGS include/uapi/linux/pktcdvd.h:CONFIG_CDROM_PKTCDVD_WCACHE diff --git a/usr/include/Makefile b/usr/include/Makefile index 55362f3ab393..f6b3c85d900e 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -28,7 +28,6 @@ no-header-test += linux/am437x-vpfe.h no-header-test += linux/android/binder.h no-header-test += linux/android/binderfs.h no-header-test += linux/coda.h -no-header-test += linux/elfcore.h no-header-test += linux/errqueue.h no-header-test += linux/fsmap.h no-header-test += linux/hdlc/ioctl.h -- cgit v1.2.3 From 8f4c0e01789c18674acdf17cae3822b3dc3db715 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Wed, 22 Jul 2020 10:40:16 -0400 Subject: hsr: enhance netlink socket interface to support PRP Parallel Redundancy Protocol (PRP) is another redundancy protocol introduced by IEC 63439 standard. It is similar to HSR in many aspects:- - Use a pair of Ethernet interfaces to created the PRP device - Use a 6 byte redundancy protocol part (RCT, Redundancy Check Trailer) similar to HSR Tag. - Has Link Redundancy Entity (LRE) that works with RCT to implement redundancy. Key difference is that the protocol unit is a trailer instead of a prefix as in HSR. That makes it inter-operable with tradition network components such as bridges/switches which treat it as pad bytes, whereas HSR nodes requires some kind of translators (Called redbox) to talk to regular network devices. This features allows regular linux box to be converted to a DAN-P box. DAN-P stands for Dual Attached Node - PRP similar to DAN-H (Dual Attached Node - HSR). Add a comment at the header/source code to explicitly state that the driver files also handles PRP protocol as well. Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- include/uapi/linux/hsr_netlink.h | 2 +- include/uapi/linux/if_link.h | 12 +++++++++++- net/hsr/Kconfig | 35 +++++++++++++++++++++++------------ net/hsr/hsr_debugfs.c | 2 +- net/hsr/hsr_device.c | 7 +++++-- net/hsr/hsr_device.h | 2 ++ net/hsr/hsr_forward.c | 2 ++ net/hsr/hsr_forward.h | 2 ++ net/hsr/hsr_framereg.c | 1 + net/hsr/hsr_framereg.h | 2 ++ net/hsr/hsr_main.c | 2 ++ net/hsr/hsr_main.h | 11 ++++++++++- net/hsr/hsr_netlink.c | 38 +++++++++++++++++++++++++++++++------- net/hsr/hsr_netlink.h | 2 ++ net/hsr/hsr_slave.c | 2 ++ net/hsr/hsr_slave.h | 2 ++ 16 files changed, 99 insertions(+), 25 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/hsr_netlink.h b/include/uapi/linux/hsr_netlink.h index c218ef9c35dd..d540ea9bbef4 100644 --- a/include/uapi/linux/hsr_netlink.h +++ b/include/uapi/linux/hsr_netlink.h @@ -17,7 +17,7 @@ /* Generic Netlink HSR family definition */ -/* attributes */ +/* attributes for HSR or PRP node */ enum { HSR_A_UNSPEC, HSR_A_NODE_ADDR, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index af8f31987526..63af64646358 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -907,7 +907,14 @@ enum { #define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) -/* HSR section */ +/* HSR/PRP section, both uses same interface */ + +/* Different redundancy protocols for hsr device */ +enum { + HSR_PROTOCOL_HSR, + HSR_PROTOCOL_PRP, + HSR_PROTOCOL_MAX, +}; enum { IFLA_HSR_UNSPEC, @@ -917,6 +924,9 @@ enum { IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ IFLA_HSR_SEQ_NR, IFLA_HSR_VERSION, /* HSR version */ + IFLA_HSR_PROTOCOL, /* Indicate different protocol than + * HSR. For example PRP. + */ __IFLA_HSR_MAX, }; diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig index 8095b034e76e..1b048c17b6c8 100644 --- a/net/hsr/Kconfig +++ b/net/hsr/Kconfig @@ -4,24 +4,35 @@ # config HSR - tristate "High-availability Seamless Redundancy (HSR)" + tristate "High-availability Seamless Redundancy (HSR & PRP)" help + This enables IEC 62439 defined High-availability Seamless + Redundancy (HSR) and Parallel Redundancy Protocol (PRP). + If you say Y here, then your Linux box will be able to act as a - DANH ("Doubly attached node implementing HSR"). For this to work, - your Linux box needs (at least) two physical Ethernet interfaces, - and it must be connected as a node in a ring network together with - other HSR capable nodes. + DANH ("Doubly attached node implementing HSR") or DANP ("Doubly + attached node implementing PRP"). For this to work, your Linux box + needs (at least) two physical Ethernet interfaces. + + For DANH, it must be connected as a node in a ring network together + with other HSR capable nodes. All Ethernet frames sent over the HSR + device will be sent in both directions on the ring (over both slave + ports), giving a redundant, instant fail-over network. Each HSR node + in the ring acts like a bridge for HSR frames, but filters frames + that have been forwarded earlier. - All Ethernet frames sent over the hsr device will be sent in both - directions on the ring (over both slave ports), giving a redundant, - instant fail-over network. Each HSR node in the ring acts like a - bridge for HSR frames, but filters frames that have been forwarded - earlier. + For DANP, it must be connected as a node connecting to two + separate networks over the two slave interfaces. Like HSR, Ethernet + frames sent over the PRP device will be sent to both networks giving + a redundant, instant fail-over network. Unlike HSR, PRP networks + can have Singly Attached Nodes (SAN) such as PC, printer, bridges + etc and will be able to communicate with DANP nodes. This code is a "best effort" to comply with the HSR standard as described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1), - but no compliancy tests have been made. Use iproute2 to select - the version you desire. + and PRP standard described in IEC 62439-4:2012 (PRP), but no + compliancy tests have been made. Use iproute2 to select the protocol + you would like to use. You need to perform any and all necessary tests yourself before relying on this code in a safety critical system! diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 9787ef11ca71..c1932c0a15be 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -1,5 +1,5 @@ /* - * hsr_debugfs code + * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 8a927b647829..40ac45123a62 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -3,9 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se - * * This file contains device methods for creating, using and destroying - * virtual HSR devices. + * virtual HSR or PRP devices. */ #include @@ -427,6 +426,10 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); + /* currently PRP is not supported */ + if (protocol_version == PRP_V1) + return -EPROTONOSUPPORT; + /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index b8f9262ed101..868373822ee4 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_DEVICE_H diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index ab8dca0c0b65..55adb4dbd235 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame router for HSR and PRP. */ #include "hsr_forward.h" diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h index 51a69295566c..b2a6fa319d94 100644 --- a/net/hsr/hsr_forward.h +++ b/net/hsr/hsr_forward.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FORWARD_H diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 530de24b1fb5..13b2190e6556 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -8,6 +8,7 @@ * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. + * Same code handles filtering of duplicates for PRP as well. */ #include diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 0f0fa12b4329..c06447780d05 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FRAMEREG_H diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 144da15f0a81..2fd1976e5b1c 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Event handling for HSR and PRP devices. */ #include diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index f74193465bf5..8cf10d67d5f9 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_PRIVATE_H @@ -131,6 +133,13 @@ struct hsr_port { enum hsr_port_type type; }; +/* used by driver internally to differentiate various protocols */ +enum hsr_version { + HSR_V0 = 0, + HSR_V1, + PRP_V1, +}; + struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; @@ -141,7 +150,7 @@ struct hsr_priv { int announce_count; u16 sequence_nr; u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ - u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ + enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ spinlock_t seqnr_lock; /* locking for sequence_nr */ spinlock_t list_lock; /* locking for node list */ unsigned char sup_multicast_addr[ETH_ALEN]; diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 6e14b7d22639..06c3cd988760 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -4,7 +4,7 @@ * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * - * Routines for handling Netlink messages for HSR. + * Routines for handling Netlink messages for HSR and PRP. */ #include "hsr_netlink.h" @@ -22,6 +22,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { [IFLA_HSR_VERSION] = { .type = NLA_U8 }, [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN }, [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 }, + [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 }, }; /* Here, it seems a netdevice has already been allocated for us, and the @@ -31,8 +32,10 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { + enum hsr_version proto_version; + unsigned char multicast_spec; + u8 proto = HSR_PROTOCOL_HSR; struct net_device *link[2]; - unsigned char multicast_spec, hsr_version; if (!data) { NL_SET_ERR_MSG_MOD(extack, "No slave devices specified"); @@ -69,18 +72,34 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, else multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]); + if (data[IFLA_HSR_PROTOCOL]) + proto = nla_get_u8(data[IFLA_HSR_PROTOCOL]); + + if (proto >= HSR_PROTOCOL_MAX) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol\n"); + return -EINVAL; + } + if (!data[IFLA_HSR_VERSION]) { - hsr_version = 0; + proto_version = HSR_V0; } else { - hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]); - if (hsr_version > 1) { + if (proto == HSR_PROTOCOL_PRP) { + NL_SET_ERR_MSG_MOD(extack, "PRP version unsupported\n"); + return -EINVAL; + } + + proto_version = nla_get_u8(data[IFLA_HSR_VERSION]); + if (proto_version > HSR_V1) { NL_SET_ERR_MSG_MOD(extack, - "Only versions 0..1 are supported"); + "Only HSR version 0/1 supported\n"); return -EINVAL; } } - return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack); + if (proto == HSR_PROTOCOL_PRP) + proto_version = PRP_V1; + + return hsr_dev_finalize(dev, link, multicast_spec, proto_version, extack); } static void hsr_dellink(struct net_device *dev, struct list_head *head) @@ -102,6 +121,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); + u8 proto = HSR_PROTOCOL_HSR; struct hsr_port *port; port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); @@ -120,6 +140,10 @@ static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) hsr->sup_multicast_addr) || nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr)) goto nla_put_failure; + if (hsr->prot_version == PRP_V1) + proto = HSR_PROTOCOL_PRP; + if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto)) + goto nla_put_failure; return 0; diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 1121bb192a18..501552d9753b 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_NETLINK_H diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 25b6ffba26cd..b5c0834de338 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame handler other utility functions for HSR and PRP. */ #include "hsr_slave.h" diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h index 8953ea279ce9..9708a4f0ec09 100644 --- a/net/hsr/hsr_slave.h +++ b/net/hsr/hsr_slave.h @@ -2,6 +2,8 @@ /* Copyright 2011-2014 Autronica Fire and Security AS * * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_SLAVE_H -- cgit v1.2.3 From 08b95c338e0c5a96e47f4ca314ea1e7580ecb5d7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 8 Jul 2020 14:11:52 +0300 Subject: fanotify: remove event FAN_DIR_MODIFY It was never enabled in uapi and its functionality is about to be superseded by events FAN_CREATE, FAN_DELETE, FAN_MOVE with group flag FAN_REPORT_NAME. Keep a place holder variable name_event instead of removing the name recording code since it will be used by the new events. Link: https://lore.kernel.org/r/20200708111156.24659-17-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 9 ++------- fs/notify/fsnotify.c | 2 +- include/linux/fsnotify.h | 6 ------ include/linux/fsnotify_backend.h | 4 +--- include/uapi/linux/fanotify.h | 1 - 5 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e417c64c365b..e6ba605732d7 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -425,6 +425,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, gfp_t gfp = GFP_KERNEL_ACCOUNT; struct inode *id = fanotify_fid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); + bool name_event = false; /* * For queues with unlimited length lost events are not expected and @@ -442,12 +443,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if (fanotify_is_perm_event(mask)) { event = fanotify_alloc_perm_event(path, gfp); - } else if (mask & FAN_DIR_MODIFY && !(WARN_ON_ONCE(!file_name))) { - /* - * For FAN_DIR_MODIFY event, we report the fid of the directory - * and the name of the modified entry. - * Allocate an fanotify_name_event struct and copy the name. - */ + } else if (name_event && file_name) { event = fanotify_alloc_name_event(id, fsid, file_name, gfp); } else if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { event = fanotify_alloc_fid_event(id, fsid, gfp); @@ -528,7 +524,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM); BUILD_BUG_ON(FAN_CREATE != FS_CREATE); BUILD_BUG_ON(FAN_DELETE != FS_DELETE); - BUILD_BUG_ON(FAN_DIR_MODIFY != FS_DIR_MODIFY); BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF); BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index c4ac4d13e10f..f12a554be3f0 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -393,7 +393,7 @@ static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 316c9b820517..9b2566d273a9 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -30,12 +30,6 @@ static inline void fsnotify_name(struct inode *dir, __u32 mask, const struct qstr *name, u32 cookie) { fsnotify(dir, mask, child, FSNOTIFY_EVENT_INODE, name, cookie); - /* - * Send another flavor of the event without child inode data and - * without the specific event type (e.g. FS_CREATE|FS_IS_DIR). - * The name is relative to the dir inode the event is reported to. - */ - fsnotify(dir, FS_DIR_MODIFY, dir, FSNOTIFY_EVENT_INODE, name, 0); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0de130cbf72d..94a4ff3d5bbe 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -47,7 +47,6 @@ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ -#define FS_DIR_MODIFY 0x00080000 /* Directory entry was modified */ #define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */ /* This inode cares about things that happen to its children. Always set for @@ -67,8 +66,7 @@ * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ -#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | \ - FS_DIR_MODIFY) +#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE) #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index a88c7c6d0692..7f2f17eacbf9 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -24,7 +24,6 @@ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ -#define FAN_DIR_MODIFY 0x00080000 /* Directory entry was modified */ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ -- cgit v1.2.3 From 83b7a59896dd24015a34b7f00027f0ff3747972f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 16 Jul 2020 11:42:26 +0300 Subject: fanotify: add basic support for FAN_REPORT_DIR_FID For now, the flag is mutually exclusive with FAN_REPORT_FID. Events include a single info record of type FAN_EVENT_INFO_TYPE_DFID with a directory file handle. For now, events are only reported for: - Directory modification events - Events on children of a watching directory - Events on directory objects Soon, we will add support for reporting the parent directory fid for events on non-directories with filesystem/mount mark and support for reporting both parent directory fid and child fid. Link: https://lore.kernel.org/r/20200716084230.30611-19-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 34 ++++++++++++++++-- fs/notify/fanotify/fanotify_user.c | 71 ++++++++++++++++++++++++++++++++------ include/linux/fanotify.h | 2 +- include/uapi/linux/fanotify.h | 11 +++--- 4 files changed, 101 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 3baf93e998c1..fc2e1fab34af 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -223,7 +223,7 @@ out: static u32 fanotify_group_event_mask(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info, u32 event_mask, const void *data, - int data_type) + int data_type, struct inode *dir) { __u32 marks_mask = 0, marks_ignored_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | @@ -243,6 +243,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, /* Path type events are only relevant for files and dirs */ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry)) return 0; + } else if (!(fid_mode & FAN_REPORT_FID)) { + /* Do we have a directory inode to report? */ + if (!dir && !(event_mask & FS_ISDIR)) + return 0; } fsnotify_foreach_obj_type(type) { @@ -396,6 +400,28 @@ static struct inode *fanotify_fid_inode(u32 event_mask, const void *data, return fsnotify_data_inode(data, data_type); } +/* + * The inode to use as identifier when reporting dir fid depends on the event. + * Report the modified directory inode on dirent modification events. + * Report the "victim" inode if "victim" is a directory. + * Report the parent inode if "victim" is not a directory and event is + * reported to parent. + * Otherwise, do not report dir fid. + */ +static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data, + int data_type, struct inode *dir) +{ + struct inode *inode = fsnotify_data_inode(data, data_type); + + if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) + return dir; + + if (S_ISDIR(inode->i_mode)) + return inode; + + return dir; +} + static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, gfp_t gfp) { @@ -491,10 +517,14 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct fanotify_event *event = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; struct inode *id = fanotify_fid_inode(mask, data, data_type, dir); + struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); bool name_event = false; + if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) + id = dirid; + /* * For queues with unlimited length lost events are not expected and * can possibly have security implications. Avoid losing events when @@ -605,7 +635,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); mask = fanotify_group_event_mask(group, iter_info, mask, data, - data_type); + data_type, dir); if (!mask) return 0; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3842ef00b52e..e494400711c9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -216,7 +216,7 @@ static int process_access_response(struct fsnotify_group *group, } static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, - const char *name, size_t name_len, + int info_type, const char *name, size_t name_len, char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; @@ -229,7 +229,7 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", __func__, fh_len, name_len, info_len, count); - if (!fh_len || (name && !name_len)) + if (!fh_len) return 0; if (WARN_ON_ONCE(len < sizeof(info) || len > count)) @@ -239,8 +239,21 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, * Copy event info fid header followed by variable sized file handle * and optionally followed by variable sized filename. */ - info.hdr.info_type = name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : - FAN_EVENT_INFO_TYPE_FID; + switch (info_type) { + case FAN_EVENT_INFO_TYPE_FID: + case FAN_EVENT_INFO_TYPE_DFID: + if (WARN_ON_ONCE(name_len)) + return -EFAULT; + break; + case FAN_EVENT_INFO_TYPE_DFID_NAME: + if (WARN_ON_ONCE(!name || !name_len)) + return -EFAULT; + break; + default: + return -EFAULT; + } + + info.hdr.info_type = info_type; info.hdr.len = len; info.fsid = *fsid; if (copy_to_user(buf, &info, sizeof(info))) @@ -304,8 +317,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event_metadata metadata; struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); + unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct file *f = NULL; int ret, fd = FAN_NOFD; + int info_type = 0; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -346,9 +361,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, /* Event info records order is: dir fid + name, child fid */ if (fanotify_event_dir_fh_len(event)) { + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; ret = copy_info_to_user(fanotify_event_fsid(event), fanotify_info_dir_fh(info), - fanotify_info_name(info), + info_type, fanotify_info_name(info), info->name_len, buf, count); if (ret < 0) return ret; @@ -358,9 +374,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } if (fanotify_event_object_fh_len(event)) { + if (fid_mode == FAN_REPORT_FID || info_type) { + /* + * With only group flag FAN_REPORT_FID only type FID is + * reported. Second info record type is always FID. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_DIR_FID, a single info + * record has type DFID for directory entry modification + * event and for event on a directory. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID; + } else { + /* + * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, + * a single info record has type FID for event on a + * non-directory, when there is no directory to report. + * For example, on FAN_DELETE_SELF event. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } + ret = copy_info_to_user(fanotify_event_fsid(event), fanotify_event_object_fh(event), - NULL, 0, buf, count); + info_type, NULL, 0, buf, count); if (ret < 0) return ret; @@ -861,6 +901,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct fsnotify_group *group; int f_flags, fd; struct user_struct *user; + unsigned int fid_mode = flags & FANOTIFY_FID_BITS; + unsigned int class = flags & FANOTIFY_CLASS_BITS; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -887,10 +929,19 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) return -EINVAL; } - if ((flags & FANOTIFY_FID_BITS) && - (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF) + if (fid_mode && class != FAN_CLASS_NOTIF) return -EINVAL; + /* Reporting either object fid or dir fid */ + switch (fid_mode) { + case 0: + case FAN_REPORT_FID: + case FAN_REPORT_DIR_FID: + break; + default: + return -EINVAL; + } + user = get_current_user(); if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { free_uid(user); @@ -926,7 +977,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - switch (flags & FANOTIFY_CLASS_BITS) { + switch (class) { case FAN_CLASS_NOTIF: group->priority = FS_PRIO_0; break; @@ -1236,7 +1287,7 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 9); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index bbbee11d2521..4ddac97b2bf7 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -18,7 +18,7 @@ #define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \ FAN_CLASS_PRE_CONTENT) -#define FANOTIFY_FID_BITS (FAN_REPORT_FID) +#define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DIR_FID) #define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \ FAN_REPORT_TID | \ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 7f2f17eacbf9..21afebf77fd7 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -53,6 +53,7 @@ /* Flags to determine fanotify event format */ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ #define FAN_REPORT_FID 0x00000200 /* Report unique file id */ +#define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */ /* Deprecated - do not use this in programs and do not add new flags here! */ #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \ @@ -117,6 +118,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_FID 1 #define FAN_EVENT_INFO_TYPE_DFID_NAME 2 +#define FAN_EVENT_INFO_TYPE_DFID 3 /* Variable length info record following event metadata */ struct fanotify_event_info_header { @@ -126,10 +128,11 @@ struct fanotify_event_info_header { }; /* - * Unique file identifier info record. This is used both for - * FAN_EVENT_INFO_TYPE_FID records and for FAN_EVENT_INFO_TYPE_DFID_NAME - * records. For FAN_EVENT_INFO_TYPE_DFID_NAME there is additionally a null - * terminated name immediately after the file handle. + * Unique file identifier info record. + * This structure is used for records of types FAN_EVENT_INFO_TYPE_FID, + * FAN_EVENT_INFO_TYPE_DFID and FAN_EVENT_INFO_TYPE_DFID_NAME. + * For FAN_EVENT_INFO_TYPE_DFID_NAME there is additionally a null terminated + * name immediately after the file handle. */ struct fanotify_event_info_fid { struct fanotify_event_info_header hdr; -- cgit v1.2.3 From 929943b38daf817f2e6d303ea04401651fc3bc05 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 16 Jul 2020 11:42:28 +0300 Subject: fanotify: add support for FAN_REPORT_NAME Introduce a new fanotify_init() flag FAN_REPORT_NAME. It requires the flag FAN_REPORT_DIR_FID and there is a constant for setting both flags named FAN_REPORT_DFID_NAME. For a group with flag FAN_REPORT_NAME, the parent fid and name are reported for directory entry modification events (create/detete/move) and for events on non-directory objects. Events on directories themselves are reported with their own fid and "." as the name. The parent fid and name are reported with an info record of type FAN_EVENT_INFO_TYPE_DFID_NAME, similar to the way that parent fid is reported with into type FAN_EVENT_INFO_TYPE_DFID, but with an appended null terminated name string. Link: https://lore.kernel.org/r/20200716084230.30611-21-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 18 ++++++++++++++- fs/notify/fanotify/fanotify_user.c | 45 ++++++++++++++++++++++++++++++-------- include/linux/fanotify.h | 2 +- include/uapi/linux/fanotify.h | 4 ++++ 4 files changed, 58 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index fc2e1fab34af..d793f3e56b26 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -522,9 +522,25 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); bool name_event = false; - if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) + if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) { id = dirid; + /* + * We record file name only in a group with FAN_REPORT_NAME + * and when we have a directory inode to report. + * + * For directory entry modification event, we record the fid of + * the directory and the name of the modified entry. + * + * For event on non-directory that is reported to parent, we + * record the fid of the parent and the name of the child. + */ + if ((fid_mode & FAN_REPORT_NAME) && + ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + !(mask & FAN_ONDIR))) + name_event = true; + } + /* * For queues with unlimited length lost events are not expected and * can possibly have security implications. Avoid losing events when diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 7caa64d028ba..6b839790cb42 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -64,18 +64,27 @@ static int fanotify_fid_info_len(int fh_len, int name_len) return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(struct fanotify_event *event) +static int fanotify_event_info_len(unsigned int fid_mode, + struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); int dir_fh_len = fanotify_event_dir_fh_len(event); int fh_len = fanotify_event_object_fh_len(event); int info_len = 0; + int dot_len = 0; - if (dir_fh_len) + if (dir_fh_len) { info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); + } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not recorded in + * event on a directory, we will report the name ".". + */ + dot_len = 1; + } if (fh_len) - info_len += fanotify_fid_info_len(fh_len, 0); + info_len += fanotify_fid_info_len(fh_len, dot_len); return info_len; } @@ -91,6 +100,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, { size_t event_size = FAN_EVENT_METADATA_LEN; struct fanotify_event *event = NULL; + unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -98,8 +108,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, if (fsnotify_notify_queue_is_empty(group)) goto out; - if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) { - event_size += fanotify_event_info_len( + if (fid_mode) { + event_size += fanotify_event_info_len(fid_mode, FANOTIFY_E(fsnotify_peek_first_event(group))); } @@ -325,7 +335,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(event); + fanotify_event_info_len(fid_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; @@ -374,12 +384,25 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } if (fanotify_event_object_fh_len(event)) { + const char *dot = NULL; + int dot_len = 0; + if (fid_mode == FAN_REPORT_FID || info_type) { /* * With only group flag FAN_REPORT_FID only type FID is * reported. Second info record type is always FID. */ info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((fid_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not + * recorded in an event on a directory, report the + * name "." with info type DFID_NAME. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; + dot = "."; + dot_len = 1; } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || (event->mask & FAN_ONDIR)) { /* @@ -400,7 +423,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, ret = copy_info_to_user(fanotify_event_fsid(event), fanotify_event_object_fh(event), - info_type, NULL, 0, buf, count); + info_type, dot, dot_len, buf, count); if (ret < 0) return ret; @@ -932,11 +955,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (fid_mode && class != FAN_CLASS_NOTIF) return -EINVAL; - /* Reporting either object fid or dir fid */ + /* + * Reporting either object fid or dir fid. + * Child name is reported with parent fid so requires dir fid. + */ switch (fid_mode) { case 0: case FAN_REPORT_FID: case FAN_REPORT_DIR_FID: + case FAN_REPORT_DFID_NAME: break; default: return -EINVAL; @@ -1294,7 +1321,7 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 9); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 4ddac97b2bf7..3e9c56ee651f 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -18,7 +18,7 @@ #define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \ FAN_CLASS_PRE_CONTENT) -#define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DIR_FID) +#define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) #define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \ FAN_REPORT_TID | \ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 21afebf77fd7..fbf9c5c7dd59 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -54,6 +54,10 @@ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ #define FAN_REPORT_FID 0x00000200 /* Report unique file id */ #define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */ +#define FAN_REPORT_NAME 0x00000800 /* Report events with name */ + +/* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ +#define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) /* Deprecated - do not use this in programs and do not add new flags here! */ #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \ -- cgit v1.2.3 From e1613b5714ee6c186c9628e9958edf65e9d9cddd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jul 2020 15:47:15 -0700 Subject: bpf: Fix bpf_ringbuf_output() signature to return long Due to bpf tree fix merge, bpf_ringbuf_output() signature ended up with int as a return type, while all other helpers got converted to returning long. So fix it in bpf-next now. Fixes: b0659d8a950d ("bpf: Fix definition of bpf_ringbuf_output() helper in UAPI comments") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200727224715.652037-1-andriin@fb.com --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1ba4ae6a916..eb5e0c38eb2c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3241,7 +3241,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e1ba4ae6a916..eb5e0c38eb2c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3241,7 +3241,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification -- cgit v1.2.3 From bc2d214af5dbcf1e53be9a23d95905a585657ff4 Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Sun, 26 Jul 2020 17:35:09 +0200 Subject: scsi: target: tcmu: Implement tmr_notify callback This patch implements the tmr_notify callback for tcmu. When the callback is called, tcmu checks the list of aborted commands it received as parameter: - aborted commands in the qfull_queue are removed from the queue and target_complete_command is called - from the cmd_ids of aborted commands currently uncompleted in cmd ring it creates a list of aborted cmd_ids. Finally a TMR notification is written to cmd ring containing TMR type and cmd_id list. If there is no space in ring, the TMR notification is queued on a TMR specific queue. The TMR specific queue 'tmr_queue' can be seen as a extension of the cmd ring. At the end of each iexecution of tcmu_complete_commands() we check whether tmr_queue contains TMRs and try to move them onto the ring. If tmr_queue is not empty after that, we don't call run_qfull_queue() because commands must not overtake TMRs. This way we guarantee that cmd_ids in TMR notification received by userspace either match an active, not yet completed command or are no longer valid due to userspace having complete some cmd_ids meanwhile. New commands that were assigned to an aborted cmd_id will always appear on the cmd ring _after_ the TMR. Link: https://lore.kernel.org/r/20200726153510.13077-8-bstroesser@ts.fujitsu.com Reviewed-by: Mike Christie Signed-off-by: Bodo Stroesser Signed-off-by: Martin K. Petersen --- drivers/target/target_core_user.c | 225 ++++++++++++++++++++++++++++++++-- include/uapi/linux/target_core_user.h | 25 ++++ 2 files changed, 241 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index bddd40f07929..cb5a561a46e8 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -137,6 +137,7 @@ struct tcmu_dev { struct mutex cmdr_lock; struct list_head qfull_queue; + struct list_head tmr_queue; uint32_t dbi_max; uint32_t dbi_thresh; @@ -183,6 +184,15 @@ struct tcmu_cmd { #define TCMU_CMD_BIT_EXPIRED 0 unsigned long flags; }; + +struct tcmu_tmr { + struct list_head queue_entry; + + uint8_t tmr_type; + uint32_t tmr_cmd_cnt; + int16_t tmr_cmd_ids[0]; +}; + /* * To avoid dead lock the mutex lock order should always be: * @@ -844,6 +854,9 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd, return false; } + if (!data_needed) + return true; + /* try to check and get the data blocks as needed */ space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh); if ((space * DATA_BLOCK_SIZE) < data_needed) { @@ -1106,6 +1119,60 @@ queue: return 1; } +/** + * queue_tmr_ring - queue tmr info to ring or internally + * @udev: related tcmu_dev + * @tmr: tcmu_tmr containing tmr info to queue + * + * Returns: + * 0 success + * 1 internally queued to wait for ring memory to free. + */ +static int +queue_tmr_ring(struct tcmu_dev *udev, struct tcmu_tmr *tmr) +{ + struct tcmu_tmr_entry *entry; + int cmd_size; + int id_list_sz; + struct tcmu_mailbox *mb = udev->mb_addr; + uint32_t cmd_head; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) + goto out_free; + + id_list_sz = sizeof(tmr->tmr_cmd_ids[0]) * tmr->tmr_cmd_cnt; + cmd_size = round_up(sizeof(*entry) + id_list_sz, TCMU_OP_ALIGN_SIZE); + + if (!list_empty(&udev->tmr_queue) || + !is_ring_space_avail(udev, NULL, cmd_size, 0)) { + list_add_tail(&tmr->queue_entry, &udev->tmr_queue); + pr_debug("adding tmr %p on dev %s to TMR ring space wait queue\n", + tmr, udev->name); + return 1; + } + + cmd_head = ring_insert_padding(udev, cmd_size); + + entry = (void *)mb + CMDR_OFF + cmd_head; + memset(entry, 0, cmd_size); + tcmu_hdr_set_op(&entry->hdr.len_op, TCMU_OP_TMR); + tcmu_hdr_set_len(&entry->hdr.len_op, cmd_size); + entry->tmr_type = tmr->tmr_type; + entry->cmd_cnt = tmr->tmr_cmd_cnt; + memcpy(&entry->cmd_ids[0], &tmr->tmr_cmd_ids[0], id_list_sz); + tcmu_flush_dcache_range(entry, cmd_size); + + UPDATE_HEAD(mb->cmd_head, cmd_size, udev->cmdr_size); + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + uio_event_notify(&udev->uio_info); + +out_free: + kfree(tmr); + + return 0; +} + static sense_reason_t tcmu_queue_cmd(struct se_cmd *se_cmd) { @@ -1141,6 +1208,85 @@ static void tcmu_set_next_deadline(struct list_head *queue, del_timer(timer); } +static int +tcmu_tmr_type(enum tcm_tmreq_table tmf) +{ + switch (tmf) { + case TMR_ABORT_TASK: return TCMU_TMR_ABORT_TASK; + case TMR_ABORT_TASK_SET: return TCMU_TMR_ABORT_TASK_SET; + case TMR_CLEAR_ACA: return TCMU_TMR_CLEAR_ACA; + case TMR_CLEAR_TASK_SET: return TCMU_TMR_CLEAR_TASK_SET; + case TMR_LUN_RESET: return TCMU_TMR_LUN_RESET; + case TMR_TARGET_WARM_RESET: return TCMU_TMR_TARGET_WARM_RESET; + case TMR_TARGET_COLD_RESET: return TCMU_TMR_TARGET_COLD_RESET; + case TMR_LUN_RESET_PRO: return TCMU_TMR_LUN_RESET_PRO; + default: return TCMU_TMR_UNKNOWN; + } +} + +static void +tcmu_tmr_notify(struct se_device *se_dev, enum tcm_tmreq_table tmf, + struct list_head *cmd_list) +{ + int i = 0, cmd_cnt = 0; + bool unqueued = false; + uint16_t *cmd_ids = NULL; + struct tcmu_cmd *cmd; + struct se_cmd *se_cmd; + struct tcmu_tmr *tmr; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + + mutex_lock(&udev->cmdr_lock); + + /* First we check for aborted commands in qfull_queue */ + list_for_each_entry(se_cmd, cmd_list, state_list) { + i++; + if (!se_cmd->priv) + continue; + cmd = se_cmd->priv; + /* Commands on qfull queue have no id yet */ + if (cmd->cmd_id) { + cmd_cnt++; + continue; + } + pr_debug("Removing aborted command %p from queue on dev %s.\n", + cmd, udev->name); + + list_del_init(&cmd->queue_entry); + tcmu_free_cmd(cmd); + target_complete_cmd(se_cmd, SAM_STAT_TASK_ABORTED); + unqueued = true; + } + if (unqueued) + tcmu_set_next_deadline(&udev->qfull_queue, &udev->qfull_timer); + + pr_debug("TMR event %d on dev %s, aborted cmds %d, afflicted cmd_ids %d\n", + tcmu_tmr_type(tmf), udev->name, i, cmd_cnt); + + tmr = kmalloc(sizeof(*tmr) + cmd_cnt * sizeof(*cmd_ids), GFP_KERNEL); + if (!tmr) + goto unlock; + + tmr->tmr_type = tcmu_tmr_type(tmf); + tmr->tmr_cmd_cnt = cmd_cnt; + + if (cmd_cnt != 0) { + cmd_cnt = 0; + list_for_each_entry(se_cmd, cmd_list, state_list) { + if (!se_cmd->priv) + continue; + cmd = se_cmd->priv; + if (cmd->cmd_id) + tmr->tmr_cmd_ids[cmd_cnt++] = cmd->cmd_id; + } + } + + queue_tmr_ring(udev, tmr); + +unlock: + mutex_unlock(&udev->cmdr_lock); +} + static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry) { struct se_cmd *se_cmd = cmd->se_cmd; @@ -1208,11 +1354,43 @@ out: tcmu_free_cmd(cmd); } +static int tcmu_run_tmr_queue(struct tcmu_dev *udev) +{ + struct tcmu_tmr *tmr, *tmp; + LIST_HEAD(tmrs); + + if (list_empty(&udev->tmr_queue)) + return 1; + + pr_debug("running %s's tmr queue\n", udev->name); + + list_splice_init(&udev->tmr_queue, &tmrs); + + list_for_each_entry_safe(tmr, tmp, &tmrs, queue_entry) { + list_del_init(&tmr->queue_entry); + + pr_debug("removing tmr %p on dev %s from queue\n", + tmr, udev->name); + + if (queue_tmr_ring(udev, tmr)) { + pr_debug("ran out of space during tmr queue run\n"); + /* + * tmr was requeued, so just put all tmrs back in + * the queue + */ + list_splice_tail(&tmrs, &udev->tmr_queue); + return 0; + } + } + + return 1; +} + static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) { struct tcmu_mailbox *mb; struct tcmu_cmd *cmd; - int handled = 0; + bool free_space = false; if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { pr_err("ring broken, not handling completions\n"); @@ -1235,7 +1413,10 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) tcmu_flush_dcache_range(entry, ring_left < sizeof(*entry) ? ring_left : sizeof(*entry)); - if (tcmu_hdr_get_op(entry->hdr.len_op) == TCMU_OP_PAD) { + free_space = true; + + if (tcmu_hdr_get_op(entry->hdr.len_op) == TCMU_OP_PAD || + tcmu_hdr_get_op(entry->hdr.len_op) == TCMU_OP_TMR) { UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(entry->hdr.len_op), udev->cmdr_size); @@ -1256,9 +1437,9 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(entry->hdr.len_op), udev->cmdr_size); - - handled++; } + if (free_space) + free_space = tcmu_run_tmr_queue(udev); if (atomic_read(&global_db_count) > tcmu_global_max_blocks && idr_is_empty(&udev->commands) && list_empty(&udev->qfull_queue)) { @@ -1271,7 +1452,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) if (udev->cmd_time_out) tcmu_set_next_deadline(&udev->inflight_queue, &udev->cmd_timer); - return handled; + return free_space; } static void tcmu_check_expired_ring_cmd(struct tcmu_cmd *cmd) @@ -1381,6 +1562,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) INIT_LIST_HEAD(&udev->node); INIT_LIST_HEAD(&udev->timedout_entry); INIT_LIST_HEAD(&udev->qfull_queue); + INIT_LIST_HEAD(&udev->tmr_queue); INIT_LIST_HEAD(&udev->inflight_queue); idr_init(&udev->commands); @@ -1455,8 +1637,8 @@ static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on) struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); mutex_lock(&udev->cmdr_lock); - tcmu_handle_completions(udev); - run_qfull_queue(udev, false); + if (tcmu_handle_completions(udev)) + run_qfull_queue(udev, false); mutex_unlock(&udev->cmdr_lock); return 0; @@ -1609,6 +1791,16 @@ static void tcmu_blocks_release(struct radix_tree_root *blocks, } } +static void tcmu_remove_all_queued_tmr(struct tcmu_dev *udev) +{ + struct tcmu_tmr *tmr, *tmp; + + list_for_each_entry_safe(tmr, tmp, &udev->tmr_queue, queue_entry) { + list_del_init(&tmr->queue_entry); + kfree(tmr); + } +} + static void tcmu_dev_kref_release(struct kref *kref) { struct tcmu_dev *udev = container_of(kref, struct tcmu_dev, kref); @@ -1631,6 +1823,8 @@ static void tcmu_dev_kref_release(struct kref *kref) if (tcmu_check_and_free_pending_cmd(cmd) != 0) all_expired = false; } + /* There can be left over TMR cmds. Remove them. */ + tcmu_remove_all_queued_tmr(udev); if (!list_empty(&udev->qfull_queue)) all_expired = false; idr_destroy(&udev->commands); @@ -1885,7 +2079,9 @@ static int tcmu_configure_device(struct se_device *dev) /* Initialise the mailbox of the ring buffer */ mb = udev->mb_addr; mb->version = TCMU_MAILBOX_VERSION; - mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC | TCMU_MAILBOX_FLAG_CAP_READ_LEN; + mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC | + TCMU_MAILBOX_FLAG_CAP_READ_LEN | + TCMU_MAILBOX_FLAG_CAP_TMR; mb->cmdr_off = CMDR_OFF; mb->cmdr_size = udev->cmdr_size; @@ -2055,6 +2251,15 @@ static void tcmu_reset_ring(struct tcmu_dev *udev, u8 err_level) del_timer(&udev->cmd_timer); + /* + * ring is empty and qfull queue never contains aborted commands. + * So TMRs in tmr queue do not contain relevant cmd_ids. + * After a ring reset userspace should do a fresh start, so + * even LUN RESET message is no longer relevant. + * Therefore remove all TMRs from qfull queue + */ + tcmu_remove_all_queued_tmr(udev); + run_qfull_queue(udev, false); mutex_unlock(&udev->cmdr_lock); @@ -2607,6 +2812,7 @@ static struct target_backend_ops tcmu_ops = { .destroy_device = tcmu_destroy_device, .free_device = tcmu_free_device, .parse_cdb = tcmu_parse_cdb, + .tmr_notify = tcmu_tmr_notify, .set_configfs_dev_params = tcmu_set_configfs_dev_params, .show_configfs_dev_params = tcmu_show_configfs_dev_params, .get_device_type = sbc_get_device_type, @@ -2633,7 +2839,8 @@ static void find_free_blocks(void) } /* Try to complete the finished commands first */ - tcmu_handle_completions(udev); + if (tcmu_handle_completions(udev)) + run_qfull_queue(udev, false); /* Skip the udevs in idle */ if (!udev->dbi_thresh) { diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index b7b57967d90f..95b1597f16ae 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -45,6 +45,7 @@ #define ALIGN_SIZE 64 /* Should be enough for most CPUs */ #define TCMU_MAILBOX_FLAG_CAP_OOOC (1 << 0) /* Out-of-order completions */ #define TCMU_MAILBOX_FLAG_CAP_READ_LEN (1 << 1) /* Read data length */ +#define TCMU_MAILBOX_FLAG_CAP_TMR (1 << 2) /* TMR notifications */ struct tcmu_mailbox { __u16 version; @@ -62,6 +63,7 @@ struct tcmu_mailbox { enum tcmu_opcode { TCMU_OP_PAD = 0, TCMU_OP_CMD, + TCMU_OP_TMR, }; /* @@ -128,6 +130,29 @@ struct tcmu_cmd_entry { } __packed; +struct tcmu_tmr_entry { + struct tcmu_cmd_entry_hdr hdr; + +#define TCMU_TMR_UNKNOWN 0 +#define TCMU_TMR_ABORT_TASK 1 +#define TCMU_TMR_ABORT_TASK_SET 2 +#define TCMU_TMR_CLEAR_ACA 3 +#define TCMU_TMR_CLEAR_TASK_SET 4 +#define TCMU_TMR_LUN_RESET 5 +#define TCMU_TMR_TARGET_WARM_RESET 6 +#define TCMU_TMR_TARGET_COLD_RESET 7 +/* Pseudo reset due to received PR OUT */ +#define TCMU_TMR_LUN_RESET_PRO 128 + __u8 tmr_type; + + __u8 __pad1; + __u16 __pad2; + __u32 cmd_cnt; + __u64 __pad3; + __u64 __pad4; + __u16 cmd_ids[0]; +} __packed; + #define TCMU_OP_ALIGN_SIZE sizeof(__u64) enum tcmu_genl_cmd { -- cgit v1.2.3 From 556c811f24b30cc883733a2eaf9e939817589231 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 Jul 2020 17:03:09 +0300 Subject: RDMA/efa: Expose maximum TX doorbell batch The device reports the maximum number of bytes to be written before ringing the doorbell (zero means unlimited). This patch queries the max batch size and reports it back to the userspace library. Link: https://lore.kernel.org/r/20200722140312.3651-2-galpress@amazon.com Reviewed-by: Daniel Kranzdorf Reviewed-by: Firas JahJah Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 11 +++++++++++ drivers/infiniband/hw/efa/efa_com_cmd.c | 1 + drivers/infiniband/hw/efa/efa_com_cmd.h | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 1 + include/uapi/rdma/efa-abi.h | 4 +++- 5 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index bef2bd291054..03e7388af06e 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -632,6 +632,17 @@ struct efa_admin_feature_queue_attr_desc { /* Maximum number of SGEs for a single RDMA read WQE */ u16 max_wr_rdma_sges; + + /* + * Maximum number of bytes that can be written to SQ between two + * consecutive doorbells (in units of 64B). Driver must ensure that only + * complete WQEs are written to queue before issuing a doorbell. + * Examples: max_tx_batch=16 and WQE size = 64B, means up to 16 WQEs can + * be written to SQ between two consecutive doorbells. max_tx_batch=11 + * and WQE size = 128B, means up to 5 WQEs can be written to SQ between + * two consecutive doorbells. Zero means unlimited. + */ + u16 max_tx_batch; }; struct efa_admin_feature_aenq_desc { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index fabd8df2e78f..53cfde5c43d8 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -480,6 +480,7 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->max_llq_size = resp.u.queue_attr.max_llq_size; result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq; result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges; + result->max_tx_batch = resp.u.queue_attr.max_tx_batch; err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); if (err) { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 41ce4a476ee6..8df2a26d57d4 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -127,6 +127,7 @@ struct efa_com_get_device_attr_result { u16 max_sq_sge; u16 max_rq_sge; u16 max_wr_rdma_sge; + u16 max_tx_batch; u8 db_bar; }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 08313f7c73bc..f49d14cebe4a 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1525,6 +1525,7 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; resp.inline_buf_size = dev->dev_attr.inline_buf_size; resp.max_llq_size = dev->dev_attr.max_llq_size; + resp.max_tx_batch = dev->dev_attr.max_tx_batch; if (udata && udata->outlen) { err = ib_copy_to_udata(udata, &resp, diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 53b6e2036a9b..10781763da37 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_ABI_USER_H @@ -31,6 +31,8 @@ struct efa_ibv_alloc_ucontext_resp { __u16 sub_cqs_per_cq; __u16 inline_buf_size; __u32 max_llq_size; /* bytes */ + __u16 max_tx_batch; /* units of 64 bytes */ + __u8 reserved_90[6]; }; struct efa_ibv_alloc_pd_resp { -- cgit v1.2.3 From da2924bdca99768442c5e0ed0a9024ae79d62765 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 Jul 2020 17:03:10 +0300 Subject: RDMA/efa: Expose minimum SQ size The device reports the minimum SQ size required for creation. This patch queries the min SQ size and reports it back to the userspace library. Link: https://lore.kernel.org/r/20200722140312.3651-3-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Shadi Ammouri Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 4 ++-- drivers/infiniband/hw/efa/efa_com_cmd.c | 1 + drivers/infiniband/hw/efa/efa_com_cmd.h | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 1 + include/uapi/rdma/efa-abi.h | 3 ++- 5 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 03e7388af06e..5484b08bbc5d 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -606,8 +606,8 @@ struct efa_admin_feature_queue_attr_desc { /* Number of sub-CQs to be created for each CQ */ u16 sub_cqs_per_cq; - /* MBZ */ - u16 reserved; + /* Minimum number of WQEs per SQ */ + u16 min_sq_depth; /* Maximum number of SGEs (buffers) allowed for a single send WQE */ u16 max_wr_send_sges; diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 53cfde5c43d8..6ac23627f65a 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -481,6 +481,7 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq; result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges; result->max_tx_batch = resp.u.queue_attr.max_tx_batch; + result->min_sq_depth = resp.u.queue_attr.min_sq_depth; err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); if (err) { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 8df2a26d57d4..190bac23f585 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -128,6 +128,7 @@ struct efa_com_get_device_attr_result { u16 max_rq_sge; u16 max_wr_rdma_sge; u16 max_tx_batch; + u16 min_sq_depth; u8 db_bar; }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index f49d14cebe4a..26102ab333b2 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1526,6 +1526,7 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) resp.inline_buf_size = dev->dev_attr.inline_buf_size; resp.max_llq_size = dev->dev_attr.max_llq_size; resp.max_tx_batch = dev->dev_attr.max_tx_batch; + resp.min_sq_wr = dev->dev_attr.min_sq_depth; if (udata && udata->outlen) { err = ib_copy_to_udata(udata, &resp, diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 10781763da37..7ef2306f8dd4 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -32,7 +32,8 @@ struct efa_ibv_alloc_ucontext_resp { __u16 inline_buf_size; __u32 max_llq_size; /* bytes */ __u16 max_tx_batch; /* units of 64 bytes */ - __u8 reserved_90[6]; + __u16 min_sq_wr; + __u8 reserved_a0[4]; }; struct efa_ibv_alloc_pd_resp { -- cgit v1.2.3 From a5d87b698547233321466b2dc91271f5855a4df6 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 Jul 2020 17:03:11 +0300 Subject: RDMA/efa: User/kernel compatibility handshake mechanism Introduce a mechanism that performs an handshake between the userspace provider and kernel driver which verifies that the user supports all required features in order to operate correctly. The handshake verifies the needed functionality by comparing the reported device caps and the provider caps. If the device reports a non-zero capability the appropriate comp mask is required from the userspace provider in order to allocate the context. Link: https://lore.kernel.org/r/20200722140312.3651-4-galpress@amazon.com Reviewed-by: Shadi Ammouri Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 40 +++++++++++++++++++++++++++++++++++ include/uapi/rdma/efa-abi.h | 10 +++++++++ 2 files changed, 50 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 26102ab333b2..fda175836fb6 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1501,11 +1501,39 @@ static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) return efa_com_dealloc_uar(&dev->edev, ¶ms); } +#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \ + (_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \ + NULL : #_attr) + +static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext, + const struct efa_ibv_alloc_ucontext_cmd *cmd) +{ + struct efa_dev *dev = to_edev(ibucontext->device); + char *attr_str; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch, + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str)) + goto err; + + if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR, + attr_str)) + goto err; + + return 0; + +err: + ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n", + attr_str); + return -EOPNOTSUPP; +} + int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) { struct efa_ucontext *ucontext = to_eucontext(ibucontext); struct efa_dev *dev = to_edev(ibucontext->device); struct efa_ibv_alloc_ucontext_resp resp = {}; + struct efa_ibv_alloc_ucontext_cmd cmd = {}; struct efa_com_alloc_uar_result result; int err; @@ -1514,6 +1542,18 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) * we will ack input fields in our response. */ + err = ib_copy_from_udata(&cmd, udata, + min(sizeof(cmd), udata->inlen)); + if (err) { + ibdev_dbg(&dev->ibdev, + "Cannot copy udata for alloc_ucontext\n"); + goto err_out; + } + + err = efa_user_comp_handshake(ibucontext, &cmd); + if (err) + goto err_out; + err = efa_com_alloc_uar(&dev->edev, &result); if (err) goto err_out; diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 7ef2306f8dd4..507a2862bedb 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -20,6 +20,16 @@ * hex bit offset of the field. */ +enum { + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_cmd { + __u32 comp_mask; + __u8 reserved_20[4]; +}; + enum efa_ibv_user_cmds_supp_udata { EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, -- cgit v1.2.3 From 50935339c394adfb3d7253055e3bc10ee70264b0 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Sat, 25 Jul 2020 19:02:25 +0200 Subject: netfilter: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_connmark.h | 2 +- net/decnet/netfilter/dn_rtmsg.c | 2 +- net/netfilter/Kconfig | 2 +- net/netfilter/nfnetlink_acct.c | 2 +- net/netfilter/nft_set_pipapo.c | 4 ++-- net/netfilter/xt_CONNSECMARK.c | 2 +- net/netfilter/xt_connmark.c | 2 +- net/netfilter/xt_nfacct.c | 2 +- net/netfilter/xt_time.c | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_connmark.h b/include/uapi/linux/netfilter/xt_connmark.h index 1aa5c955ee1e..f01c19b83a2b 100644 --- a/include/uapi/linux/netfilter/xt_connmark.h +++ b/include/uapi/linux/netfilter/xt_connmark.h @@ -4,7 +4,7 @@ #include -/* Copyright (C) 2002,2004 MARA Systems AB +/* Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * * This program is free software; you can redistribute it and/or modify diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index dc705769acc9..26a9193df783 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -6,7 +6,7 @@ * * DECnet Routing Message Grabulator * - * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/ + * (C) 2000 ChyGwyn Limited - https://www.chygwyn.com/ * * Author: Steven Whitehouse */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0ffe2b8723c4..25313c29d799 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -447,7 +447,7 @@ config NF_TABLES replace the existing {ip,ip6,arp,eb}_tables infrastructure. It provides a pseudo-state machine with an extensible instruction-set (also known as expressions) that the userspace 'nft' utility - (http://www.netfilter.org/projects/nftables) uses to build the + (https://www.netfilter.org/projects/nftables) uses to build the rule-set. It also comes with the generic set infrastructure that allows you to construct mappings between matchings and actions for performance lookups. diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5827117f2635..5bfec829c12f 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso - * (C) 2011 Intra2net AG + * (C) 2011 Intra2net AG */ #include #include diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index cc6082a5f7ad..9944523f5c2c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -312,7 +312,7 @@ * Jay Ligatti, Josh Kuhn, and Chris Gage. * Proceedings of the IEEE International Conference on Computer * Communication Networks (ICCCN), August 2010. - * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf * * [Rottenstreich 2010] * Worst-Case TCAM Rule Expansion @@ -325,7 +325,7 @@ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, * and Patrick Eugster. * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. - * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf */ #include diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index a5c8b653476a..76acecf3e757 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -6,7 +6,7 @@ * with the SECMARK target and state match. * * Based somewhat on CONNMARK: - * Copyright (C) 2002,2004 MARA Systems AB + * Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * * (C) 2006,2008 Red Hat, Inc., James Morris diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index eec2f3a88d73..e5ebc0810675 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -2,7 +2,7 @@ /* * xt_connmark - Netfilter module to operate on connection marks * - * Copyright (C) 2002,2004 MARA Systems AB + * Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * Jan Engelhardt diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 5aab6df74e0f..a97c2259bbc8 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso - * (C) 2011 Intra2net AG + * (C) 2011 Intra2net AG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 67cb98489415..6aa12d0f54e2 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -5,7 +5,7 @@ * based on ipt_time by Fabrice MARIE * This is a module which is used for time matching * It is using some modified code from dietlibc (localtime() function) - * that you can find at http://www.fefe.de/dietlibc/ + * that you can find at https://www.fefe.de/dietlibc/ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. */ -- cgit v1.2.3 From df78a0c0b67de58934877aad61e0431a2bd0caf1 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 1 Jun 2020 23:22:47 -0700 Subject: nl80211: S1G band and channel definitions Gives drivers the definitions needed to advertise support for S1G bands. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200602062247.23212-1-thomas@adapt-ip.com Link: https://lore.kernel.org/r/20200731055636.795173-1-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 9 ++------- include/net/cfg80211.h | 17 +++++++++++++++++ include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/mac80211/chan.c | 7 ++++++- net/mac80211/scan.c | 1 + net/mac80211/tx.c | 1 + net/mac80211/util.c | 5 +++++ net/wireless/chan.c | 35 +++++++++++++++++++++++++++++++++++ net/wireless/core.c | 5 +++-- net/wireless/util.c | 8 ++++++++ 10 files changed, 94 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 919d15584d4a..3c0c33a9f30c 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -568,11 +568,7 @@ chan_to_phymode(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_40: phymode = MODE_11NG_HT40; break; - case NL80211_CHAN_WIDTH_5: - case NL80211_CHAN_WIDTH_10: - case NL80211_CHAN_WIDTH_80: - case NL80211_CHAN_WIDTH_80P80: - case NL80211_CHAN_WIDTH_160: + default: phymode = MODE_UNKNOWN; break; } @@ -597,8 +593,7 @@ chan_to_phymode(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_80P80: phymode = MODE_11AC_VHT80_80; break; - case NL80211_CHAN_WIDTH_5: - case NL80211_CHAN_WIDTH_10: + default: phymode = MODE_UNKNOWN; break; } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fc7e8807838d..ac6e58193426 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -417,6 +417,22 @@ struct ieee80211_edmg { enum ieee80211_edmg_bw_config bw_config; }; +/** + * struct ieee80211_sta_s1g_cap - STA's S1G capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11ah S1G capabilities for a STA. + * + * @s1g_supported: is STA an S1G STA + * @cap: S1G capabilities information + * @nss_mcs: Supported NSS MCS set + */ +struct ieee80211_sta_s1g_cap { + bool s1g; + u8 cap[10]; /* use S1G_CAPAB_ */ + u8 nss_mcs[5]; +}; + /** * struct ieee80211_supported_band - frequency band definition * @@ -448,6 +464,7 @@ struct ieee80211_supported_band { int n_bitrates; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_edmg edmg_cap; u16 n_iftype_data; const struct ieee80211_sband_iftype_data *iftype_data; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4e6339ab1fce..ad183469f9af 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4437,6 +4437,11 @@ enum nl80211_key_mode { * attribute must be provided as well * @NL80211_CHAN_WIDTH_5: 5 MHz OFDM channel * @NL80211_CHAN_WIDTH_10: 10 MHz OFDM channel + * @NL80211_CHAN_WIDTH_1: 1 MHz OFDM channel + * @NL80211_CHAN_WIDTH_2: 2 MHz OFDM channel + * @NL80211_CHAN_WIDTH_4: 4 MHz OFDM channel + * @NL80211_CHAN_WIDTH_8: 8 MHz OFDM channel + * @NL80211_CHAN_WIDTH_16: 16 MHz OFDM channel */ enum nl80211_chan_width { NL80211_CHAN_WIDTH_20_NOHT, @@ -4447,6 +4452,11 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_160, NL80211_CHAN_WIDTH_5, NL80211_CHAN_WIDTH_10, + NL80211_CHAN_WIDTH_1, + NL80211_CHAN_WIDTH_2, + NL80211_CHAN_WIDTH_4, + NL80211_CHAN_WIDTH_8, + NL80211_CHAN_WIDTH_16, }; /** @@ -4457,11 +4467,15 @@ enum nl80211_chan_width { * @NL80211_BSS_CHAN_WIDTH_20: control channel is 20 MHz wide or compatible * @NL80211_BSS_CHAN_WIDTH_10: control channel is 10 MHz wide * @NL80211_BSS_CHAN_WIDTH_5: control channel is 5 MHz wide + * @NL80211_BSS_CHAN_WIDTH_1: control channel is 1 MHz wide + * @NL80211_BSS_CHAN_WIDTH_2: control channel is 2 MHz wide */ enum nl80211_bss_scan_width { NL80211_BSS_CHAN_WIDTH_20, NL80211_BSS_CHAN_WIDTH_10, NL80211_BSS_CHAN_WIDTH_5, + NL80211_BSS_CHAN_WIDTH_1, + NL80211_BSS_CHAN_WIDTH_2, }; /** @@ -4740,6 +4754,7 @@ enum nl80211_txrate_gi { * @NL80211_BAND_5GHZ: around 5 GHz band (4.9 - 5.7 GHz) * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz) * @NL80211_BAND_6GHZ: around 6 GHz band (5.9 - 7.2 GHz) + * @NL80211_BAND_S1GHZ: around 900MHz, supported by S1G PHYs * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace * since newer kernel versions may support more bands */ @@ -4748,6 +4763,7 @@ enum nl80211_band { NL80211_BAND_5GHZ, NL80211_BAND_60GHZ, NL80211_BAND_6GHZ, + NL80211_BAND_S1GHZ, NUM_NL80211_BANDS, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e6e192f53e4e..08cf9da9c1e3 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -313,9 +313,14 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, lockdep_assert_held(&local->chanctx_mtx); - /* don't optimize 5MHz, 10MHz, and radar_enabled confs */ + /* don't optimize non-20MHz based and radar_enabled confs */ if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 || ctx->conf.def.width == NL80211_CHAN_WIDTH_10 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_1 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_2 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_4 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_8 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; return; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index ad90bbe57457..8003be6dae8a 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -913,6 +913,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, case NL80211_BSS_CHAN_WIDTH_10: local->scan_chandef.width = NL80211_CHAN_WIDTH_10; break; + default: case NL80211_BSS_CHAN_WIDTH_20: /* If scanning on oper channel, use whatever channel-type * is currently in use. diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1a2941e5244f..ee30ef441f4a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -166,6 +166,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; + case NL80211_BAND_S1GHZ: case NL80211_BAND_60GHZ: /* TODO, for now fall through */ case NUM_NL80211_BANDS: diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 21c94094a699..64a83ecd0a73 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3730,6 +3730,11 @@ u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c) c->width = NL80211_CHAN_WIDTH_20_NOHT; ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; break; + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); diff --git a/net/wireless/chan.c b/net/wireless/chan.c index cddf92c5d09e..90f0f82cd9ca 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -153,6 +153,11 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: @@ -263,6 +268,21 @@ static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) int width; switch (c->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; @@ -911,6 +931,21 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/core.c b/net/wireless/core.c index c623d9bf5096..1971d7e6eb55 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -803,10 +803,11 @@ int wiphy_register(struct wiphy *wiphy) if (WARN_ON(!sband->n_channels)) return -EINVAL; /* - * on 60GHz band, there are no legacy rates, so + * on 60GHz or sub-1Ghz band, there are no legacy rates, so * n_bitrates is 0 */ - if (WARN_ON(band != NL80211_BAND_60GHZ && + if (WARN_ON((band != NL80211_BAND_60GHZ && + band != NL80211_BAND_S1GHZ) && !sband->n_bitrates)) return -EINVAL; diff --git a/net/wireless/util.c b/net/wireless/util.c index 4d3b76f94f55..26a977343c3b 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -102,6 +102,8 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; + case NL80211_BAND_S1GHZ: + return 902000 + chan * 500; default: ; } @@ -210,6 +212,12 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; + case NL80211_BAND_S1GHZ: + /* Figure 9-589bd: 3 means unsupported, so != 3 means at least + * mandatory is ok. + */ + WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); + break; case NUM_NL80211_BANDS: default: WARN_ON(1); -- cgit v1.2.3 From 987021726f9f41a1daf335c57cd7b6261109cdb2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:43:21 -0700 Subject: net/wireless: nl80211.h: drop duplicate words in comments Drop doubled words in several comments. Signed-off-by: Randy Dunlap Cc: netdev@vger.kernel.org Cc: Kalle Valo Cc: linux-wireless@vger.kernel.org Cc: Johannes Berg Link: https://lore.kernel.org/r/20200715164325.9109-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ad183469f9af..f47a7a8d0216 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -363,7 +363,7 @@ * @NL80211_CMD_SET_STATION: Set station attributes for station identified by * %NL80211_ATTR_MAC on the interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_NEW_STATION: Add a station with given attributes to the - * the interface identified by %NL80211_ATTR_IFINDEX. + * interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_DEL_STATION: Remove a station identified by %NL80211_ATTR_MAC * or, if no MAC address given, all stations, on the interface identified * by %NL80211_ATTR_IFINDEX. %NL80211_ATTR_MGMT_SUBTYPE and @@ -383,7 +383,7 @@ * @NL80211_CMD_DEL_MPATH: Delete a mesh path to the destination given by * %NL80211_ATTR_MAC. * @NL80211_CMD_NEW_PATH: Add a mesh path with given attributes to the - * the interface identified by %NL80211_ATTR_IFINDEX. + * interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_DEL_PATH: Remove a mesh path identified by %NL80211_ATTR_MAC * or, if no MAC address given, all mesh paths, on the interface identified * by %NL80211_ATTR_IFINDEX. @@ -934,7 +934,7 @@ * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules. * * @NL80211_CMD_CHANNEL_SWITCH: Perform a channel switch by announcing the - * the new channel information (Channel Switch Announcement - CSA) + * new channel information (Channel Switch Announcement - CSA) * in the beacon for some time (as defined in the * %NL80211_ATTR_CH_SWITCH_COUNT parameter) and then change to the * new channel. Userspace provides the new channel information (using @@ -1113,7 +1113,7 @@ * randomization may be enabled and configured by specifying the * %NL80211_ATTR_MAC and %NL80211_ATTR_MAC_MASK attributes. * If a timeout is requested, use the %NL80211_ATTR_TIMEOUT attribute. - * A u64 cookie for further %NL80211_ATTR_COOKIE use is is returned in + * A u64 cookie for further %NL80211_ATTR_COOKIE use is returned in * the netlink extended ack message. * * To cancel a measurement, close the socket that requested it. @@ -1511,7 +1511,7 @@ enum nl80211_commands { * rates as defined by IEEE 802.11 7.3.2.2 but without the length * restriction (at most %NL80211_MAX_SUPP_RATES). * @NL80211_ATTR_STA_VLAN: interface index of VLAN interface to move station - * to, or the AP interface the station was originally added to to. + * to, or the AP interface the station was originally added to. * @NL80211_ATTR_STA_INFO: information about a station, part of station info * given for %NL80211_CMD_GET_STATION, nested attribute containing * info as possible, see &enum nl80211_sta_info. @@ -2084,7 +2084,7 @@ enum nl80211_commands { * @NL80211_ATTR_STA_SUPPORTED_CHANNELS: array of supported channels. * * @NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES: array of supported - * supported operating classes. + * operating classes. * * @NL80211_ATTR_HANDLE_DFS: A flag indicating whether user space * controls DFS operation in IBSS mode. If the flag is included in @@ -2395,7 +2395,7 @@ enum nl80211_commands { * nl80211_txq_stats) * @NL80211_ATTR_TXQ_LIMIT: Total packet limit for the TXQ queues for this phy. * The smaller of this and the memory limit is enforced. - * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory memory limit (in bytes) for the + * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory limit (in bytes) for the * TXQ queues for this phy. The smaller of this and the packet limit is * enforced. * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes @@ -5652,7 +5652,7 @@ enum nl80211_feature_flags { * enum nl80211_ext_feature_index - bit index of extended features. * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates. * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can - * can request to use RRM (see %NL80211_ATTR_USE_RRM) with + * request to use RRM (see %NL80211_ATTR_USE_RRM) with * %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set * the ASSOC_REQ_USE_RRM flag in the association request even if * NL80211_FEATURE_QUIET is not advertized. @@ -6061,7 +6061,7 @@ enum nl80211_dfs_state { }; /** - * enum enum nl80211_protocol_features - nl80211 protocol features + * enum nl80211_protocol_features - nl80211 protocol features * @NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP: nl80211 supports splitting * wiphy dumps (if requested by the application with the attribute * %NL80211_ATTR_SPLIT_WIPHY_DUMP. Also supported is filtering the -- cgit v1.2.3 From 0f55c0c500f2bbfc5cc5590cdf6973b3f64dc195 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:43:22 -0700 Subject: net/wireless: wireless.h: drop duplicate word in comments Drop doubled word "threshold" in a comment. Signed-off-by: Randy Dunlap Cc: netdev@vger.kernel.org Cc: Kalle Valo Cc: linux-wireless@vger.kernel.org Cc: Johannes Berg Link: https://lore.kernel.org/r/20200715164325.9109-2-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/wireless.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/wireless.h b/include/uapi/linux/wireless.h index 24f3371ad826..08967b3f19c8 100644 --- a/include/uapi/linux/wireless.h +++ b/include/uapi/linux/wireless.h @@ -914,7 +914,7 @@ union iwreq_data { struct iw_param sens; /* signal level threshold */ struct iw_param bitrate; /* default bit rate */ struct iw_param txpower; /* default transmit power */ - struct iw_param rts; /* RTS threshold threshold */ + struct iw_param rts; /* RTS threshold */ struct iw_param frag; /* Fragmentation threshold */ __u32 mode; /* Operation mode */ struct iw_param retry; /* Retry limits & lifetime */ -- cgit v1.2.3 From e3718a611470d311a92c60d4eb535270b49a7108 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Wed, 17 Jun 2020 09:30:33 +0200 Subject: cfg80211/mac80211: add mesh_param "mesh_nolearn" to skip path discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, before being able to forward a packet between two 802.11s nodes, both a PLINK handshake is performed upon receiving a beacon and then later a PREQ/PREP exchange for path discovery is performed on demand upon receiving a data frame to forward. When running a mesh protocol on top of an 802.11s interface, like batman-adv, we do not need the multi-hop mesh routing capabilities of 802.11s and usually set mesh_fwding=0. However, even with mesh_fwding=0 the PREQ/PREP path discovery is still performed on demand. Even though in this scenario the next hop PREQ/PREP will determine is always the direct 11s neighbor node. The new mesh_nolearn parameter allows to skip the PREQ/PREP exchange in this scenario, leading to a reduced delay, reduced packet buffering and simplifies HWMP in general. mesh_nolearn is still rather conservative in that if the packet destination is not a direct 11s neighbor, it will fall back to PREQ/PREP path discovery. For normal, multi-hop 802.11s mesh routing it is usually not advisable to enable mesh_nolearn as a transmission to a direct but distant neighbor might be worse than reaching that same node via a more robust / higher throughput etc. multi-hop path. Cc: Sven Eckelmann Cc: Simon Wunderlich Signed-off-by: Linus Lüssing Link: https://lore.kernel.org/r/20200617073034.26149-1-linus.luessing@c0d3.blue [fix nl80211 policy to range 0/1 only] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 7 +++++++ net/mac80211/cfg.c | 2 ++ net/mac80211/debugfs_netdev.c | 2 ++ net/mac80211/mesh_hwmp.c | 39 +++++++++++++++++++++++++++++++++++++++ net/wireless/mesh.c | 1 + net/wireless/nl80211.c | 7 ++++++- net/wireless/trace.h | 4 +++- 8 files changed, 66 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fa4d5627397f..78b220950942 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1870,6 +1870,11 @@ struct bss_parameters { * connected to a mesh gate in mesh formation info. If false, the * value in mesh formation is determined by the presence of root paths * in the mesh path table + * @dot11MeshNolearn: Try to avoid multi-hop path discovery (e.g. PREQ/PREP + * for HWMP) if the destination is a direct neighbor. Note that this might + * not be the optimal decision as a multi-hop route might be better. So + * if using this setting you will likely also want to disable + * dot11MeshForwarding and use another mesh routing protocol on top. */ struct mesh_config { u16 dot11MeshRetryTimeout; @@ -1901,6 +1906,7 @@ struct mesh_config { enum nl80211_mesh_power_mode power_mode; u16 dot11MeshAwakeWindowDuration; u32 plink_timeout; + bool dot11MeshNolearn; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f47a7a8d0216..a83d8faf88ac 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4236,6 +4236,12 @@ enum nl80211_mesh_power_mode { * field. If left unset then the mesh formation field will only * advertise such if there is an active root mesh path. * + * @NL80211_MESHCONF_NOLEARN: Try to avoid multi-hop path discovery (e.g. + * PREQ/PREP for HWMP) if the destination is a direct neighbor. Note that + * this might not be the optimal decision as a multi-hop route might be + * better. So if using this setting you will likely also want to disable + * dot11MeshForwarding and use another mesh routing protocol on top. + * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ enum nl80211_meshconf_params { @@ -4269,6 +4275,7 @@ enum nl80211_meshconf_params { NL80211_MESHCONF_AWAKE_WINDOW, NL80211_MESHCONF_PLINK_TIMEOUT, NL80211_MESHCONF_CONNECTED_TO_GATE, + NL80211_MESHCONF_NOLEARN, /* keep last */ __NL80211_MESHCONF_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b4a74064675e..9af56b848544 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2126,6 +2126,8 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) conf->dot11MeshConnectedToMeshGate = nconf->dot11MeshConnectedToMeshGate; + if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) + conf->dot11MeshNolearn = nconf->dot11MeshNolearn; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index d7e955127d5c..09eab2c3f380 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -638,6 +638,7 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC); IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); +IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -762,6 +763,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(power_mode); MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); + MESHPARAMS_ADD(dot11MeshNolearn); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index bae3a3e15b88..bec23d2eee7a 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1172,6 +1172,40 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, return -ENOENT; } +/** + * mesh_nexthop_lookup_nolearn - try to set next hop without path discovery + * @skb: 802.11 frame to be sent + * @sdata: network subif the frame will be sent through + * + * Check if the meshDA (addr3) of a unicast frame is a direct neighbor. + * And if so, set the RA (addr1) to it to transmit to this node directly, + * avoiding PREQ/PREP path discovery. + * + * Returns: 0 if the next hop was found and -ENOENT otherwise. + */ +static int mesh_nexthop_lookup_nolearn(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + struct sta_info *sta; + + if (is_multicast_ether_addr(hdr->addr1)) + return -ENOENT; + + rcu_read_lock(); + sta = sta_info_get(sdata, hdr->addr3); + + if (!sta || sta->mesh->plink_state != NL80211_PLINK_ESTAB) { + rcu_read_unlock(); + return -ENOENT; + } + rcu_read_unlock(); + + memcpy(hdr->addr1, hdr->addr3, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + return 0; +} + /** * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame. Calling * this function is considered "using" the associated mpath, so preempt a path @@ -1185,11 +1219,16 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath; struct sta_info *next_hop; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u8 *target_addr = hdr->addr3; + if (ifmsh->mshcfg.dot11MeshNolearn && + !mesh_nexthop_lookup_nolearn(sdata, skb)) + return 0; + mpath = mesh_path_lookup(sdata, target_addr); if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE)) return -ENOENT; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index eac5aa1419fc..e4e363138279 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -78,6 +78,7 @@ const struct mesh_config default_mesh_config = { .power_mode = NL80211_MESH_POWER_ACTIVE, .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW, .plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT, + .dot11MeshNolearn = false, }; const struct mesh_setup default_mesh_setup = { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6fdf818f66cf..257c06315464 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6885,7 +6885,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT, cur_params.plink_timeout) || nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, - cur_params.dot11MeshConnectedToMeshGate)) + cur_params.dot11MeshConnectedToMeshGate) || + nla_put_u8(msg, NL80211_MESHCONF_NOLEARN, + cur_params.dot11MeshNolearn)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6943,6 +6945,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct nla_policy @@ -7094,6 +7097,8 @@ do { \ NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask, NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNolearn, mask, + NL80211_MESHCONF_NOLEARN, nla_get_u8); if (mask_out) *mask_out = mask; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b23cab016521..6e218a0acd4e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -68,7 +68,8 @@ __field(u16, ht_opmode) \ __field(u32, dot11MeshHWMPactivePathToRootTimeout) \ __field(u16, dot11MeshHWMProotInterval) \ - __field(u16, dot11MeshHWMPconfirmationInterval) + __field(u16, dot11MeshHWMPconfirmationInterval) \ + __field(bool, dot11MeshNolearn) #define MESH_CFG_ASSIGN \ do { \ __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \ @@ -109,6 +110,7 @@ conf->dot11MeshHWMProotInterval; \ __entry->dot11MeshHWMPconfirmationInterval = \ conf->dot11MeshHWMPconfirmationInterval; \ + __entry->dot11MeshNolearn = conf->dot11MeshNolearn; \ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ -- cgit v1.2.3 From 184eebe664f0e11c485f6d309fe56297b3f75e9e Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Thu, 11 Jun 2020 16:02:37 +0200 Subject: cfg80211/mac80211: add connected to auth server to meshconf Besides information about num of peerings and gate connectivity, the mesh formation byte also contains a flag for authentication server connectivity, that currently cannot be set in the mesh conf. This patch adds this capability, which is necessary to implement 802.1X authentication in mesh mode. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200611140238.427461-1-markus.theil@tu-ilmenau.de Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 + include/uapi/linux/nl80211.h | 5 +++++ net/mac80211/cfg.c | 3 +++ net/mac80211/debugfs_netdev.c | 3 +++ net/mac80211/mesh.c | 5 ++++- net/wireless/nl80211.c | 8 +++++++- 6 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 78b220950942..8d5071f84ffe 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1895,6 +1895,7 @@ struct mesh_config { u16 dot11MeshHWMPnetDiameterTraversalTime; u8 dot11MeshHWMPRootMode; bool dot11MeshConnectedToMeshGate; + bool dot11MeshConnectedToAuthServer; u16 dot11MeshHWMPRannInterval; bool dot11MeshGateAnnouncementProtocol; bool dot11MeshForwarding; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a83d8faf88ac..f1770e3756f4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4242,6 +4242,10 @@ enum nl80211_mesh_power_mode { * better. So if using this setting you will likely also want to disable * dot11MeshForwarding and use another mesh routing protocol on top. * + * @NL80211_MESHCONF_CONNECTED_TO_AS: If set to true then this mesh STA + * will advertise that it is connected to a authentication server + * in the mesh formation field. + * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ enum nl80211_meshconf_params { @@ -4276,6 +4280,7 @@ enum nl80211_meshconf_params { NL80211_MESHCONF_PLINK_TIMEOUT, NL80211_MESHCONF_CONNECTED_TO_GATE, NL80211_MESHCONF_NOLEARN, + NL80211_MESHCONF_CONNECTED_TO_AS, /* keep last */ __NL80211_MESHCONF_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9af56b848544..6a6531a50e54 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2128,6 +2128,9 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, nconf->dot11MeshConnectedToMeshGate; if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) conf->dot11MeshNolearn = nconf->dot11MeshNolearn; + if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask)) + conf->dot11MeshConnectedToAuthServer = + nconf->dot11MeshConnectedToAuthServer; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 09eab2c3f380..fe8a7a87e513 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -639,6 +639,8 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC); +IEEE80211_IF_FILE(dot11MeshConnectedToAuthServer, + u.mesh.mshcfg.dot11MeshConnectedToAuthServer, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -764,6 +766,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); MESHPARAMS_ADD(dot11MeshNolearn); + MESHPARAMS_ADD(dot11MeshConnectedToAuthServer); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 96f0323c0a3d..d0db6af16427 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -260,6 +260,7 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, bool is_connected_to_gate = ifmsh->num_gates > 0 || ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol || ifmsh->mshcfg.dot11MeshConnectedToMeshGate; + bool is_connected_to_as = ifmsh->mshcfg.dot11MeshConnectedToAuthServer; if (skb_tailroom(skb) < 2 + meshconf_len) return -ENOMEM; @@ -284,7 +285,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh Formation Info - number of neighbors */ neighbors = atomic_read(&ifmsh->estab_plinks); neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS); - *pos++ = (neighbors << 1) | is_connected_to_gate; + *pos++ = (is_connected_to_as << 7) | + (neighbors << 1) | + is_connected_to_gate; /* Mesh capability */ *pos = 0x00; *pos |= ifmsh->mshcfg.dot11MeshForwarding ? diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 257c06315464..434fd06dc5cf 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6887,7 +6887,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, cur_params.dot11MeshConnectedToMeshGate) || nla_put_u8(msg, NL80211_MESHCONF_NOLEARN, - cur_params.dot11MeshNolearn)) + cur_params.dot11MeshNolearn) || + nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_AS, + cur_params.dot11MeshConnectedToAuthServer)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6946,6 +6948,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), [NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_CONNECTED_TO_AS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct nla_policy @@ -7058,6 +7061,9 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask, NL80211_MESHCONF_CONNECTED_TO_GATE, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToAuthServer, mask, + NL80211_MESHCONF_CONNECTED_TO_AS, + nla_get_u8); /* * Check HT operation mode based on * IEEE 802.11-2016 9.4.2.57 HT Operation element. -- cgit v1.2.3 From 1303a51c24100b3b1915d6f9072fe5ae5bb4c5f6 Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Thu, 11 Jun 2020 16:02:38 +0200 Subject: cfg80211/mac80211: add connected to auth server to station info This patch adds the necessary bits to later query the auth server flag for every peer from iw. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200611140238.427461-2-markus.theil@tu-ilmenau.de Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 3 +++ net/mac80211/sta_info.c | 4 +++- net/mac80211/sta_info.h | 2 ++ net/wireless/nl80211.c | 1 + 5 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8d5071f84ffe..39fe21edd2c5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1598,6 +1598,7 @@ struct cfg80211_tid_stats { * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. * @airtime_link_metric: mesh airtime link metric. + * @connected_to_as: true if mesh STA has a path to authentication server */ struct station_info { u64 filled; @@ -1655,6 +1656,8 @@ struct station_info { u32 fcs_err_count; u32 airtime_link_metric; + + u8 connected_to_as; }; #if IS_ENABLED(CONFIG_CFG80211) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f1770e3756f4..d6b6599a6001 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3370,6 +3370,8 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_AIRTIME_LINK_METRIC: airtime link metric for mesh station * @NL80211_STA_INFO_ASSOC_AT_BOOTTIME: Timestamp (CLOCK_BOOTTIME, nanoseconds) * of STA's association + * @NL80211_STA_INFO_CONNECTED_TO_AS: set to true if STA has a path to a + * authentication server (u8, 0 or 1) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3417,6 +3419,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_AIRTIME_WEIGHT, NL80211_STA_INFO_AIRTIME_LINK_METRIC, NL80211_STA_INFO_ASSOC_AT_BOOTTIME, + NL80211_STA_INFO_CONNECTED_TO_AS, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cd8487bc6fc2..a39773b40457 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2426,7 +2426,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | - BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE); + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; @@ -2439,6 +2440,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; + sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 49728047dfad..9d398c9daa4c 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -385,6 +385,7 @@ DECLARE_EWMA(mesh_tx_rate_avg, 8, 16) * @processed_beacon: set to true after peer rates and capabilities are * processed * @connected_to_gate: true if mesh STA has a path to a mesh gate + * @connected_to_as: true if mesh STA has a path to a authentication server * @fail_avg: moving percentage of failed MSDUs * @tx_rate_avg: moving average of tx bitrate */ @@ -404,6 +405,7 @@ struct mesh_sta { bool processed_beacon; bool connected_to_gate; + bool connected_to_as; enum nl80211_plink_state plink_state; u32 plink_timeout; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 434fd06dc5cf..13a38aab1565 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5395,6 +5395,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8); + PUT_SINFO(CONNECTED_TO_AS, connected_to_as, u8); if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start_noflag(msg, -- cgit v1.2.3 From fd17dba1c860d39f655a3a08387c21e3ceca8c55 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Mon, 20 Jul 2020 13:12:25 +0530 Subject: cfg80211: Add support to advertize OCV support Add a new feature flag that drivers can use to advertize support for Operating Channel Validation (OCV) when using driver's SME for RSNA handshakes. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20200720074225.8990-1-vjakkam@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d6b6599a6001..a3ae2b060a55 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5804,6 +5804,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS: The driver * can report tx status for control port over nl80211 tx operations. * + * @NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION: Driver supports Operating + * Channel Validation (OCV) when using driver's SME for RSNA handshakes. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5859,6 +5862,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, + NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From f96622749a67d40ad5efe8a58d5fc95313097aa0 Mon Sep 17 00:00:00 2001 From: Chung-Hsien Hsu Date: Tue, 23 Jun 2020 08:49:35 -0500 Subject: nl80211: support 4-way handshake offloading for WPA/WPA2-PSK in AP mode Let drivers advertise support for AP-mode WPA/WPA2-PSK 4-way handshake offloading with a new NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK flag. Extend use of NL80211_ATTR_PMK attribute indicating it might be passed as part of NL80211_CMD_START_AP command, and contain the PSK (which is the PMK, hence the name). The driver is assumed to handle the 4-way handshake by itself in this case, instead of relying on userspace. Signed-off-by: Chung-Hsien Hsu Signed-off-by: Chi-Hsien Lin Link: https://lore.kernel.org/r/20200623134938.39997-2-chi-hsien.lin@cypress.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 41 ++++++++++++++++++++++++++++------------- net/wireless/nl80211.c | 4 +++- 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a3ae2b060a55..631f3a997b3c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -183,18 +183,27 @@ * * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK flag drivers * can indicate they support offloading EAPOL handshakes for WPA/WPA2 - * preshared key authentication. In %NL80211_CMD_CONNECT the preshared - * key should be specified using %NL80211_ATTR_PMK. Drivers supporting - * this offload may reject the %NL80211_CMD_CONNECT when no preshared - * key material is provided, for example when that driver does not - * support setting the temporal keys through %CMD_NEW_KEY. + * preshared key authentication in station mode. In %NL80211_CMD_CONNECT + * the preshared key should be specified using %NL80211_ATTR_PMK. Drivers + * supporting this offload may reject the %NL80211_CMD_CONNECT when no + * preshared key material is provided, for example when that driver does + * not support setting the temporal keys through %NL80211_CMD_NEW_KEY. * * Similarly @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X flag can be * set by drivers indicating offload support of the PTK/GTK EAPOL - * handshakes during 802.1X authentication. In order to use the offload - * the %NL80211_CMD_CONNECT should have %NL80211_ATTR_WANT_1X_4WAY_HS - * attribute flag. Drivers supporting this offload may reject the - * %NL80211_CMD_CONNECT when the attribute flag is not present. + * handshakes during 802.1X authentication in station mode. In order to + * use the offload the %NL80211_CMD_CONNECT should have + * %NL80211_ATTR_WANT_1X_4WAY_HS attribute flag. Drivers supporting this + * offload may reject the %NL80211_CMD_CONNECT when the attribute flag is + * not present. + * + * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK flag drivers + * can indicate they support offloading EAPOL handshakes for WPA/WPA2 + * preshared key authentication in AP mode. In %NL80211_CMD_START_AP + * the preshared key should be specified using %NL80211_ATTR_PMK. Drivers + * supporting this offload may reject the %NL80211_CMD_START_AP when no + * preshared key material is provided, for example when that driver does + * not support setting the temporal keys through %NL80211_CMD_NEW_KEY. * * For 802.1X the PMK or PMK-R0 are set by providing %NL80211_ATTR_PMK * using %NL80211_CMD_SET_PMK. For offloaded FT support also @@ -2362,10 +2371,11 @@ enum nl80211_commands { * * @NL80211_ATTR_PMK: attribute for passing PMK key material. Used with * %NL80211_CMD_SET_PMKSA for the PMKSA identified by %NL80211_ATTR_PMKID. - * For %NL80211_CMD_CONNECT it is used to provide PSK for offloading 4-way - * handshake for WPA/WPA2-PSK networks. For 802.1X authentication it is - * used with %NL80211_CMD_SET_PMK. For offloaded FT support this attribute - * specifies the PMK-R0 if NL80211_ATTR_PMKR0_NAME is included as well. + * For %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP it is used to provide + * PSK for offloading 4-way handshake for WPA/WPA2-PSK networks. For 802.1X + * authentication it is used with %NL80211_CMD_SET_PMK. For offloaded FT + * support this attribute specifies the PMK-R0 if NL80211_ATTR_PMKR0_NAME + * is included as well. * * @NL80211_ATTR_SCHED_SCAN_MULTI: flag attribute which user-space shall use to * indicate that it supports multiple active scheduled scan requests. @@ -5807,6 +5817,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION: Driver supports Operating * Channel Validation (OCV) when using driver's SME for RSNA handshakes. * + * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK: Device wants to do 4-way + * handshake with PSK in AP mode (PSK is passed as part of the start AP + * command). + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5863,6 +5877,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8d78a6fc59a3..a096682ec0ad 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9442,7 +9442,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (nla_len(info->attrs[NL80211_ATTR_PMK]) != WLAN_PMK_LEN) return -EINVAL; if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK)) + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK)) return -EINVAL; settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } -- cgit v1.2.3 From 48040793fa6003d211f021c6ad273477bcd90d91 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Thu, 30 Jul 2020 15:44:40 -0700 Subject: tcp: add earliest departure time to SCM_TIMESTAMPING_OPT_STATS This change adds TCP_NLA_EDT to SCM_TIMESTAMPING_OPT_STATS that reports the earliest departure time(EDT) of the timestamped skb. By tracking EDT values of the skb from different timestamps, we can observe when and how much the value changed. This allows to measure the precise delay injected on the sender host e.g. by a bpf-base throttler. Signed-off-by: Yousuk Seung Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/uapi/linux/tcp.h | 1 + net/core/skbuff.c | 2 +- net/ipv4/tcp.c | 6 +++++- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 527d668a5275..14b62d7df942 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -484,7 +484,8 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk); +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index f2acb2566333..cfcb10b75483 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -313,6 +313,7 @@ enum { TCP_NLA_SRTT, /* smoothed RTT in usecs */ TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ + TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8afefe6f6b6..4e2edfbe0e19 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4692,7 +4692,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - skb = tcp_get_timestamping_opt_stats(sk); + skb = tcp_get_timestamping_opt_stats(sk, orig_skb); opt_stats = true; } else #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4afec552f211..c06d2bfd2ec4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3501,10 +3501,12 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ 0; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3558,6 +3560,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); + nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, + TCP_NLA_PAD); return stats; } -- cgit v1.2.3 From 829eb208e80d6db95c0201cb8fa00c2f9ad87faf Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 31 Jul 2020 17:34:01 -0700 Subject: rtnetlink: add support for protodown reason netdev protodown is a mechanism that allows protocols to hold an interface down. It was initially introduced in the kernel to hold links down by a multihoming protocol. There was also an attempt to introduce protodown reason at the time but was rejected. protodown and protodown reason is supported by almost every switching and routing platform. It was ok for a while to live without a protodown reason. But, its become more critical now given more than one protocol may need to keep a link down on a system at the same time. eg: vrrp peer node, port security, multihoming protocol. Its common for Network operators and protocol developers to look for such a reason on a networking box (Its also known as errDisable by most networking operators) This patch adds support for link protodown reason attribute. There are two ways to maintain protodown reasons. (a) enumerate every possible reason code in kernel - A protocol developer has to make a request and have that appear in a certain kernel version (b) provide the bits in the kernel, and allow user-space (sysadmin or NOS distributions) to manage the bit-to-reasonname map. - This makes extending reason codes easier (kind of like the iproute2 table to vrf-name map /etc/iproute2/rt_tables.d/) This patch takes approach (b). a few things about the patch: - It treats the protodown reason bits as counter to indicate active protodown users - Since protodown attribute is already an exposed UAPI, the reason is not enforced on a protodown set. Its a no-op if not used. the patch follows the below algorithm: - presence of reason bits set indicates protodown is in use - user can set protodown and protodown reason in a single or multiple setlink operations - setlink operation to clear protodown, will return -EBUSY if there are active protodown reason bits - reason is not included in link dumps if not used example with patched iproute2: $cat /etc/iproute2/protodown_reasons.d/r.conf 0 mlag 1 evpn 2 vrrp 3 psecurity $ip link set dev vxlan0 protodown on protodown_reason vrrp on $ip link set dev vxlan0 protodown_reason mlag on $ip link show 14: vxlan0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether f6:06:be:17:91:e7 brd ff:ff:ff:ff:ff:ff protodown on $ip link set dev vxlan0 protodown_reason mlag off $ip link set dev vxlan0 protodown off protodown_reason vrrp off Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++ include/uapi/linux/if_link.h | 10 ++++ net/core/dev.c | 25 ++++++++++ net/core/rtnetlink.c | 113 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 147 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ac2cd3f49aba..ba0fa6b22787 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2058,6 +2058,8 @@ struct net_device { struct timer_list watchdog_timer; int watchdog_timeo; + u32 proto_down_reason; + struct list_head todo_list; int __percpu *pcpu_refcnt; @@ -3810,6 +3812,8 @@ int dev_get_port_parent_id(struct net_device *dev, bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); int dev_change_proto_down(struct net_device *dev, bool proto_down); int dev_change_proto_down_generic(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 63af64646358..7fba4de511de 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -170,12 +170,22 @@ enum { IFLA_PROP_LIST, IFLA_ALT_IFNAME, /* Alternative ifname */ IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, __IFLA_MAX }; #define IFLA_MAX (__IFLA_MAX - 1) +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + /* backwards compatibility for userspace */ #ifndef __KERNEL__ #define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) diff --git a/net/core/dev.c b/net/core/dev.c index 38a6371d9bc5..f7ef0f5c5569 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8715,6 +8715,31 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +/** + * dev_change_proto_down_reason - proto down reason + * + * @dev: device + * @mask: proto down mask + * @value: proto down value + */ +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value) +{ + int b; + + if (!mask) { + dev->proto_down_reason = value; + } else { + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + dev->proto_down_reason |= BIT(b); + else + dev->proto_down_reason &= ~BIT(b); + } + } +} +EXPORT_SYMBOL(dev_change_proto_down_reason); + u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, enum bpf_netdev_command cmd) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 85a4b0101f76..a54c3e0f2ee1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1000,6 +1000,16 @@ static size_t rtnl_prop_list_size(const struct net_device *dev) return size; } +static size_t rtnl_proto_down_size(const struct net_device *dev) +{ + size_t size = nla_total_size(1); + + if (dev->proto_down_reason) + size += nla_total_size(0) + nla_total_size(4); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1041,7 +1051,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ - + nla_total_size(1) /* IFLA_PROTO_DOWN */ + + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ @@ -1658,6 +1668,35 @@ nest_cancel: return ret; } +static int rtnl_fill_proto_down(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *pr; + u32 preason; + + if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + goto nla_put_failure; + + preason = dev->proto_down_reason; + if (!preason) + return 0; + + pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); + if (!pr) + return -EMSGSIZE; + + if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { + nla_nest_cancel(skb, pr); + goto nla_put_failure; + } + + nla_nest_end(skb, pr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1708,13 +1747,15 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; + if (rtnl_fill_proto_down(skb, dev)) + goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; @@ -1834,6 +1875,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, + [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2483,6 +2525,67 @@ static int do_set_master(struct net_device *dev, int ifindex, return 0; } +static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { + [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, + [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, +}; + +static int do_set_proto_down(struct net_device *dev, + struct nlattr *nl_proto_down, + struct nlattr *nl_proto_down_reason, + struct netlink_ext_ack *extack) +{ + struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; + const struct net_device_ops *ops = dev->netdev_ops; + unsigned long mask = 0; + u32 value; + bool proto_down; + int err; + + if (!ops->ndo_change_proto_down) { + NL_SET_ERR_MSG(extack, "Protodown not supported by device"); + return -EOPNOTSUPP; + } + + if (nl_proto_down_reason) { + err = nla_parse_nested_deprecated(pdreason, + IFLA_PROTO_DOWN_REASON_MAX, + nl_proto_down_reason, + ifla_proto_down_reason_policy, + NULL); + if (err < 0) + return err; + + if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { + NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); + return -EINVAL; + } + + value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); + + if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) + mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); + + dev_change_proto_down_reason(dev, mask, value); + } + + if (nl_proto_down) { + proto_down = nla_get_u8(nl_proto_down); + + /* Dont turn off protodown if there are active reasons */ + if (!proto_down && dev->proto_down_reason) { + NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); + return -EBUSY; + } + err = dev_change_proto_down(dev, + proto_down); + if (err) + return err; + } + + return 0; +} + #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 @@ -2771,9 +2874,9 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; - if (tb[IFLA_PROTO_DOWN]) { - err = dev_change_proto_down(dev, - nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { + err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], + tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; -- cgit v1.2.3 From 73b11c2ab072d5b0599d1e12cc126f55ee306daf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:26 -0700 Subject: bpf: Add support for forced LINK_DETACH command Add LINK_DETACH command to force-detach bpf_link without destroying it. It has the same behavior as auto-detaching of bpf_link due to cgroup dying for bpf_cgroup_link or net_device being destroyed for bpf_xdp_link. In such case, bpf_link is still a valid kernel object, but is defuncts and doesn't hold BPF program attached to corresponding BPF hook. This functionality allows users with enough access rights to manually force-detach attached bpf_link without killing respective owner process. This patch implements LINK_DETACH for cgroup, xdp, and netns links, mostly re-using existing link release handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-2-andriin@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 15 ++++++++++++++- kernel/bpf/net_namespace.c | 8 ++++++++ kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ net/core/dev.c | 11 ++++++++++- 6 files changed, 64 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c5e206ecf2..cef4ef0d2b4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -793,6 +793,7 @@ struct bpf_link { struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); + int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb5e0c38eb2c..b134e679e9db 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_cmd { BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, BPF_ITER_CREATE, + BPF_LINK_DETACH, }; enum bpf_map_type { @@ -634,6 +635,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { + __u32 link_fd; + } link_detach; + struct { /* struct used by BPF_ENABLE_STATS command */ __u32 type; } enable_stats; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 957cce1d5168..83ff127ef7ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -814,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); + struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here @@ -832,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + cg = cg_link->cgroup; + cg_link->cgroup = NULL; + mutex_unlock(&cgroup_mutex); - cgroup_put(cg_link->cgroup); + + cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) @@ -844,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } +static int bpf_cgroup_link_detach(struct bpf_link *link) +{ + bpf_cgroup_link_release(link); + + return 0; +} + static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { @@ -883,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .detach = bpf_cgroup_link_detach, .update_prog = cgroup_bpf_replace, .show_fdinfo = bpf_cgroup_link_show_fdinfo, .fill_link_info = bpf_cgroup_link_fill_link_info, diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 71405edd667c..542f275bf252 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -142,9 +142,16 @@ static void bpf_netns_link_release(struct bpf_link *link) bpf_prog_array_free(old_array); out_unlock: + net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } +static int bpf_netns_link_detach(struct bpf_link *link) +{ + bpf_netns_link_release(link); + return 0; +} + static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -228,6 +235,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, + .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd3d599e9e90..2f343ce15747 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3991,6 +3991,29 @@ out_put_link: return ret; } +#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd + +static int link_detach(union bpf_attr *attr) +{ + struct bpf_link *link; + int ret; + + if (CHECK_ATTR(BPF_LINK_DETACH)) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_detach.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (link->ops->detach) + ret = link->ops->detach(link); + else + ret = -EOPNOTSUPP; + + bpf_link_put(link); + return ret; +} + static int bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; @@ -4240,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; + case BPF_LINK_DETACH: + err = link_detach(&attr); + break; default: err = -EINVAL; break; diff --git a/net/core/dev.c b/net/core/dev.c index a2a57988880a..c8b911b10187 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8979,12 +8979,20 @@ static void bpf_xdp_link_release(struct bpf_link *link) /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ - if (xdp_link->dev) + if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + xdp_link->dev = NULL; + } rtnl_unlock(); } +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); @@ -9066,6 +9074,7 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, -- cgit v1.2.3 From f8951902b9daa65ba240ce8a054c727748df2147 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:32:08 -0700 Subject: MTD: mtd-abi.h: drop a duplicated word Drop the repeated word "mode" in a comment. Signed-off-by: Randy Dunlap Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Cc: linux-mtd@lists.infradead.org Signed-off-by: Richard Weinberger --- include/uapi/mtd/mtd-abi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h index 4b48fbf7d343..65b9db936557 100644 --- a/include/uapi/mtd/mtd-abi.h +++ b/include/uapi/mtd/mtd-abi.h @@ -262,7 +262,7 @@ struct mtd_ecc_stats { * @MTD_FILE_MODE_OTP_USER: OTP enabled in user mode * @MTD_FILE_MODE_RAW: OTP disabled, ECC disabled * - * These modes can be set via ioctl(MTDFILEMODE). The mode mode will be retained + * These modes can be set via ioctl(MTDFILEMODE). The mode will be retained * separately for each open file descriptor. * * Note: %MTD_FILE_MODE_RAW provides the same functionality as %MTD_OPS_RAW - -- cgit v1.2.3 From e5a52fd2b8cdb700b3c07b030e050a49ef3156b9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:33:17 -0700 Subject: xen/gntdev: gntdev.h: drop a duplicated word Drop the repeated word "of" in a comment. Signed-off-by: Randy Dunlap Reviewed-by: Juergen Gross Cc: Boris Ostrovsky Cc: Juergen Gross Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20200719003317.21454-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- include/uapi/xen/gntdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h index fe4423e518c6..9ac5515b9bc2 100644 --- a/include/uapi/xen/gntdev.h +++ b/include/uapi/xen/gntdev.h @@ -66,7 +66,7 @@ struct ioctl_gntdev_map_grant_ref { /* * Removes the grant references from the mapping table of an instance of - * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * gntdev. N.B. munmap() must be called on the relevant virtual address(es) * before this ioctl is called, or an error will result. */ #define IOCTL_GNTDEV_UNMAP_GRANT_REF \ -- cgit v1.2.3 From 321bd212619a7269308696e4ddc446930ea73fad Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Jun 2020 18:24:33 -0400 Subject: virtio: VIRTIO_F_IOMMU_PLATFORM -> VIRTIO_F_ACCESS_PLATFORM Rename the bit to match latest virtio spec. Add a compat macro to avoid breaking existing userspace. Signed-off-by: Michael S. Tsirkin Reviewed-by: David Hildenbrand --- arch/um/drivers/virtio_uml.c | 2 +- drivers/vdpa/ifcvf/ifcvf_base.h | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 ++-- drivers/vhost/net.c | 4 ++-- drivers/vhost/vdpa.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- drivers/virtio/virtio_ring.c | 2 +- include/linux/virtio_config.h | 2 +- include/uapi/linux/virtio_config.h | 10 +++++++--- tools/virtio/linux/virtio_config.h | 2 +- 10 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 351aee52aca6..a6c4bb6c2c01 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -385,7 +385,7 @@ static irqreturn_t vu_req_interrupt(int irq, void *data) } break; case VHOST_USER_SLAVE_IOTLB_MSG: - /* not supported - VIRTIO_F_IOMMU_PLATFORM */ + /* not supported - VIRTIO_F_ACCESS_PLATFORM */ case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: /* not supported - VHOST_USER_PROTOCOL_F_HOST_NOTIFIER */ default: diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index f4554412e607..24af422b5a3e 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -29,7 +29,7 @@ (1ULL << VIRTIO_F_VERSION_1) | \ (1ULL << VIRTIO_NET_F_STATUS) | \ (1ULL << VIRTIO_F_ORDER_PLATFORM) | \ - (1ULL << VIRTIO_F_IOMMU_PLATFORM) | \ + (1ULL << VIRTIO_F_ACCESS_PLATFORM) | \ (1ULL << VIRTIO_NET_F_MRG_RXBUF)) /* Only one queue pair for now. */ diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c7334cc65bb2..a9bc5e0fb353 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -55,7 +55,7 @@ struct vdpasim_virtqueue { static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | (1ULL << VIRTIO_F_VERSION_1) | - (1ULL << VIRTIO_F_IOMMU_PLATFORM); + (1ULL << VIRTIO_F_ACCESS_PLATFORM); /* State of each vdpasim device */ struct vdpasim { @@ -450,7 +450,7 @@ static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); /* DMA mapping must be done by driver */ - if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) + if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) return -EINVAL; vdpasim->features = features & vdpasim_features; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e992decfec53..8e0921d3805d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -73,7 +73,7 @@ enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VIRTIO_F_IOMMU_PLATFORM) + (1ULL << VIRTIO_F_ACCESS_PLATFORM) }; enum { @@ -1653,7 +1653,7 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) !vhost_log_access_ok(&n->dev)) goto out_unlock; - if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) { + if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { if (vhost_init_device_iotlb(&n->dev, true)) goto out_unlock; } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a54b60d6623f..18869a35d408 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -31,7 +31,7 @@ enum { (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | (1ULL << VIRTIO_F_ANY_LAYOUT) | (1ULL << VIRTIO_F_VERSION_1) | - (1ULL << VIRTIO_F_IOMMU_PLATFORM) | + (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_F_RING_PACKED) | (1ULL << VIRTIO_F_ORDER_PLATFORM) | (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 8be02f333b7a..54fd989f9353 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -1129,7 +1129,7 @@ static int virtballoon_validate(struct virtio_device *vdev) else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING); - __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM); + __virtio_clear_bit(vdev, VIRTIO_F_ACCESS_PLATFORM); return 0; } diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 58b96baa8d48..a1a5c2a91426 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2225,7 +2225,7 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_VERSION_1: break; - case VIRTIO_F_IOMMU_PLATFORM: + case VIRTIO_F_ACCESS_PLATFORM: break; case VIRTIO_F_RING_PACKED: break; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index bb4cc4910750..f2cc2a0df174 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -171,7 +171,7 @@ static inline bool virtio_has_iommu_quirk(const struct virtio_device *vdev) * Note the reverse polarity of the quirk feature (compared to most * other features), this is for compatibility with legacy systems. */ - return !virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + return !virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM); } static inline diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index ff8e7dc9d4dd..b5eda06f0d57 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -67,13 +67,17 @@ #define VIRTIO_F_VERSION_1 32 /* - * If clear - device has the IOMMU bypass quirk feature. - * If set - use platform tools to detect the IOMMU. + * If clear - device has the platform DMA (e.g. IOMMU) bypass quirk feature. + * If set - use platform DMA tools to access the memory. * * Note the reverse polarity (compared to most other features), * this is for compatibility with legacy systems. */ -#define VIRTIO_F_IOMMU_PLATFORM 33 +#define VIRTIO_F_ACCESS_PLATFORM 33 +#ifndef __KERNEL__ +/* Legacy name for VIRTIO_F_ACCESS_PLATFORM (for compatibility with old userspace) */ +#define VIRTIO_F_IOMMU_PLATFORM VIRTIO_F_ACCESS_PLATFORM +#endif /* __KERNEL__ */ /* This feature indicates support for the packed virtqueue layout. */ #define VIRTIO_F_RING_PACKED 34 diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h index dbf14c1e2188..f99ae42668e0 100644 --- a/tools/virtio/linux/virtio_config.h +++ b/tools/virtio/linux/virtio_config.h @@ -51,7 +51,7 @@ static inline bool virtio_has_iommu_quirk(const struct virtio_device *vdev) * Note the reverse polarity of the quirk feature (compared to most * other features), this is for compatibility with legacy systems. */ - return !virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + return !virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM); } static inline bool virtio_is_little_endian(struct virtio_device *vdev) -- cgit v1.2.3 From 9d2f627b7ec9d5d3246b6cec17f290ee6778c83b Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 31 Jul 2020 14:20:56 +0200 Subject: net: openvswitch: add masks cache hit counter Add a counter that counts the number of masks cache hits, and export it through the megaflow netlink statistics. Reviewed-by: Paolo Abeni Reviewed-by: Tonghao Zhang Signed-off-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/datapath.c | 5 ++++- net/openvswitch/datapath.h | 3 +++ net/openvswitch/flow_table.c | 19 ++++++++++++++----- net/openvswitch/flow_table.h | 3 ++- 5 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 9b14519e74d9..7cb76e5ca7cf 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -102,8 +102,8 @@ struct ovs_dp_megaflow_stats { __u64 n_mask_hit; /* Number of masks used for flow lookups. */ __u32 n_masks; /* Number of masks for the datapath. */ __u32 pad0; /* Pad for future expension. */ + __u64 n_cache_hit; /* Number of cache matches for flow lookups. */ __u64 pad1; /* Pad for future expension. */ - __u64 pad2; /* Pad for future expension. */ }; struct ovs_vport_stats { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6b6822f82f70..f45fee760504 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -225,13 +225,14 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; + u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), - &n_mask_hit); + &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -262,6 +263,7 @@ out: u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; + stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } @@ -699,6 +701,7 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; + mega_stats->n_cache_hit += local_stats.n_cache_hit; } } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 24fcec22fde2..38f7d3e66ca6 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -38,12 +38,15 @@ * @n_mask_hit: Number of masks looked up for flow match. * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked * up per packet. + * @n_cache_hit: The number of received packets that had their mask found using + * the mask cache. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; u64 n_mask_hit; + u64 n_cache_hit; struct u64_stats_sync syncp; }; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index af22c9ee28dd..a5912ea05352 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -667,6 +667,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, + u32 *n_cache_hit, u32 *index) { u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr); @@ -682,6 +683,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, u64_stats_update_begin(&ma->syncp); usage_counters[*index]++; u64_stats_update_end(&ma->syncp); + (*n_cache_hit)++; return flow; } } @@ -719,7 +721,8 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, - u32 *n_mask_hit) + u32 *n_mask_hit, + u32 *n_cache_hit) { struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); @@ -729,10 +732,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, int seg; *n_mask_hit = 0; + *n_cache_hit = 0; if (unlikely(!skb_hash)) { u32 mask_index = 0; + u32 cache = 0; - return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, + &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash @@ -753,7 +759,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, e = &entries[index]; if (e->skb_hash == skb_hash) { flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, - &e->mask_index); + n_cache_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; @@ -766,10 +772,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, } /* Cache miss, do full lookup. */ - flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, + &ce->mask_index); if (flow) ce->skb_hash = skb_hash; + *n_cache_hit = 0; return flow; } @@ -779,9 +787,10 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 __always_unused n_cache_hit; u32 index = 0; - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 1f664b050e3b..325e939371d8 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -82,7 +82,8 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, const struct sw_flow_key *, u32 skb_hash, - u32 *n_mask_hit); + u32 *n_mask_hit, + u32 *n_cache_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, -- cgit v1.2.3 From 9bf24f594c6acf676fb8c229f152c21bfb915ddb Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 31 Jul 2020 14:21:34 +0200 Subject: net: openvswitch: make masks cache size configurable This patch makes the masks cache size configurable, or with a size of 0, disable it. Reviewed-by: Paolo Abeni Reviewed-by: Tonghao Zhang Signed-off-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/datapath.c | 17 +++++++ net/openvswitch/flow_table.c | 101 ++++++++++++++++++++++++++++++++++----- net/openvswitch/flow_table.h | 10 +++- 4 files changed, 115 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7cb76e5ca7cf..8300cc29dec8 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -86,6 +86,7 @@ enum ovs_datapath_attr { OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */ OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ OVS_DP_ATTR_PAD, + OVS_DP_ATTR_MASKS_CACHE_SIZE, __OVS_DP_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f45fee760504..42f8cc70bb2c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1498,6 +1498,7 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ return msgsize; } @@ -1535,6 +1536,10 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; + if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(&dp->table))) + goto nla_put_failure; + genlmsg_end(skb, ovs_header); return 0; @@ -1599,6 +1604,16 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) #endif } + if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { + int err; + u32 cache_size; + + cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); + if (err) + return err; + } + dp->user_features = user_features; if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) @@ -1887,6 +1902,8 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, + [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, + PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), }; static const struct genl_ops dp_datapath_genl_ops[] = { diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index a5912ea05352..6527d84c3ea6 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -38,8 +38,8 @@ #define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_DEFAULT_HASH_ENTRIES 256 #define MC_HASH_SHIFT 8 -#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) #define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) static struct kmem_cache *flow_cache; @@ -341,15 +341,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) } } +static void __mask_cache_destroy(struct mask_cache *mc) +{ + free_percpu(mc->mask_cache); + kfree(mc); +} + +static void mask_cache_rcu_cb(struct rcu_head *rcu) +{ + struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); + + __mask_cache_destroy(mc); +} + +static struct mask_cache *tbl_mask_cache_alloc(u32 size) +{ + struct mask_cache_entry __percpu *cache = NULL; + struct mask_cache *new; + + /* Only allow size to be 0, or a power of 2, and does not exceed + * percpu allocation size. + */ + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->cache_size = size; + if (new->cache_size > 0) { + cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), + new->cache_size), + __alignof__(struct mask_cache_entry)); + if (!cache) { + kfree(new); + return NULL; + } + } + + new->mask_cache = cache; + return new; +} +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) +{ + struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_cache *new; + + if (size == mc->cache_size) + return 0; + + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return -EINVAL; + + new = tbl_mask_cache_alloc(size); + if (!new) + return -ENOMEM; + + rcu_assign_pointer(table->mask_cache, new); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + + return 0; +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_cache *mc; struct mask_array *ma; - table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * - MC_HASH_ENTRIES, - __alignof__(struct mask_cache_entry)); - if (!table->mask_cache) + mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); + if (!mc) return -ENOMEM; ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); @@ -367,6 +431,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); rcu_assign_pointer(table->mask_array, ma); + rcu_assign_pointer(table->mask_cache, mc); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -377,7 +442,7 @@ free_ti: free_mask_array: __mask_array_destroy(ma); free_mask_cache: - free_percpu(table->mask_cache); + __mask_cache_destroy(mc); return -ENOMEM; } @@ -453,9 +518,11 @@ void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); - free_percpu(table->mask_cache); - call_rcu(&table->mask_array->rcu, mask_array_rcu_cb); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + call_rcu(&ma->rcu, mask_array_rcu_cb); table_instance_destroy(table, ti, ufid_ti, false); } @@ -724,6 +791,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, u32 *n_mask_hit, u32 *n_cache_hit) { + struct mask_cache *mc = rcu_dereference(tbl->mask_cache); struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); struct mask_cache_entry *entries, *ce; @@ -733,7 +801,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, *n_mask_hit = 0; *n_cache_hit = 0; - if (unlikely(!skb_hash)) { + if (unlikely(!skb_hash || mc->cache_size == 0)) { u32 mask_index = 0; u32 cache = 0; @@ -749,11 +817,11 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, ce = NULL; hash = skb_hash; - entries = this_cpu_ptr(tbl->mask_cache); + entries = this_cpu_ptr(mc->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { - int index = hash & (MC_HASH_ENTRIES - 1); + int index = hash & (mc->cache_size - 1); struct mask_cache_entry *e; e = &entries[index]; @@ -867,6 +935,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table) return READ_ONCE(ma->count); } +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) +{ + struct mask_cache *mc = rcu_dereference(table->mask_cache); + + return READ_ONCE(mc->cache_size); +} + static struct table_instance *table_instance_expand(struct table_instance *ti, bool ufid) { @@ -1095,8 +1170,8 @@ void ovs_flow_masks_rebalance(struct flow_table *table) for (i = 0; i < masks_entries; i++) { int index = masks_and_count[i].index; - new->masks[new->count++] = - rcu_dereference_ovsl(ma->masks[index]); + if (ovsl_dereference(ma->masks[index])) + new->masks[new->count++] = ma->masks[index]; } rcu_assign_pointer(table->mask_array, new); diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 325e939371d8..74ce48fecba9 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -27,6 +27,12 @@ struct mask_cache_entry { u32 mask_index; }; +struct mask_cache { + struct rcu_head rcu; + u32 cache_size; /* Must be ^2 value. */ + struct mask_cache_entry __percpu *mask_cache; +}; + struct mask_count { int index; u64 counter; @@ -53,7 +59,7 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct mask_cache_entry __percpu *mask_cache; + struct mask_cache __rcu *mask_cache; struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; @@ -77,6 +83,8 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, const struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); int ovs_flow_tbl_num_masks(const struct flow_table *table); +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table); +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, -- cgit v1.2.3 From 88fab21c691bb1ff164e540735237a385e3afeaf Mon Sep 17 00:00:00 2001 From: Ioana-Ruxandra Stăncioi Date: Mon, 3 Aug 2020 07:33:33 +0000 Subject: seg6_iptunnel: Refactor seg6_lwt_headroom out of uapi header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the function seg6_lwt_headroom out of the seg6_iptunnel.h uapi header, because it is only used in seg6_iptunnel.c. Moreover, it is only used in the kernel code, as indicated by the "#ifdef __KERNEL__". Suggested-by: David Miller Signed-off-by: Ioana-Ruxandra Stăncioi Signed-off-by: David S. Miller --- include/uapi/linux/seg6_iptunnel.h | 21 --------------------- net/ipv6/seg6_iptunnel.c | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index 09fb608a35ec..eb815e0d0ac3 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -37,25 +37,4 @@ enum { SEG6_IPTUN_MODE_L2ENCAP, }; -#ifdef __KERNEL__ - -static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) -{ - int head = 0; - - switch (tuninfo->mode) { - case SEG6_IPTUN_MODE_INLINE: - break; - case SEG6_IPTUN_MODE_ENCAP: - head = sizeof(struct ipv6hdr); - break; - case SEG6_IPTUN_MODE_L2ENCAP: - return 0; - } - - return ((tuninfo->srh->hdrlen + 1) << 3) + head; -} - -#endif - #endif diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index e0e9f48ab14f..897fa59c47de 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -27,6 +27,23 @@ #include #endif +static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) +{ + int head = 0; + + switch (tuninfo->mode) { + case SEG6_IPTUN_MODE_INLINE: + break; + case SEG6_IPTUN_MODE_ENCAP: + head = sizeof(struct ipv6hdr); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + return 0; + } + + return ((tuninfo->srh->hdrlen + 1) << 3) + head; +} + struct seg6_lwt { struct dst_cache cache; struct seg6_iptunnel_encap tuninfo[]; -- cgit v1.2.3 From 4476770881d7ac647e3bcae0943f37e00b9c3f3c Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Wed, 29 Jul 2020 10:40:00 -0700 Subject: remoteproc: Add remoteproc character device interface Add the character device interface into remoteproc framework. This interface can be used in order to boot/shutdown remote subsystems and provides a basic ioctl based interface to implement supplementary functionality. An ioctl call is implemented to enable the shutdown on release feature which will allow remote processors to be shutdown when the controlling userspace application crashes or hangs. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Rishabh Bhatnagar Signed-off-by: Siddharth Gupta Link: https://lore.kernel.org/r/1596044401-22083-2-git-send-email-sidgup@codeaurora.org [bjorn: s/int32_t/s32/ per checkpatch] Signed-off-by: Bjorn Andersson --- Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + drivers/remoteproc/Kconfig | 9 ++ drivers/remoteproc/Makefile | 1 + drivers/remoteproc/remoteproc_cdev.c | 124 +++++++++++++++++++++ drivers/remoteproc/remoteproc_internal.h | 28 +++++ include/linux/remoteproc.h | 5 + include/uapi/linux/remoteproc_cdev.h | 37 ++++++ 7 files changed, 205 insertions(+) create mode 100644 drivers/remoteproc/remoteproc_cdev.c create mode 100644 include/uapi/linux/remoteproc_cdev.h (limited to 'include/uapi') diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 59472cd6a11d..2a198838fca9 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -339,6 +339,7 @@ Code Seq# Include File Comments 0xB4 00-0F linux/gpio.h 0xB5 00-0F uapi/linux/rpmsg.h 0xB6 all linux/fpga-dfl.h +0xB7 all uapi/linux/remoteproc_cdev.h 0xC0 00-0F linux/usb/iowarrior.h 0xCA 00-0F uapi/misc/cxl.h 0xCA 10-2F uapi/misc/ocxl.h diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index 48315dc4a30c..c6659dfea7c7 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -14,6 +14,15 @@ config REMOTEPROC if REMOTEPROC +config REMOTEPROC_CDEV + bool "Remoteproc character device interface" + help + Say y here to have a character device interface for the remoteproc + framework. Userspace can boot/shutdown remote processors through + this interface. + + It's safe to say N if you don't want to use this interface. + config IMX_REMOTEPROC tristate "IMX6/7 remoteproc support" depends on ARCH_MXC diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile index 4d4307dc8fa9..3dfa28e6c701 100644 --- a/drivers/remoteproc/Makefile +++ b/drivers/remoteproc/Makefile @@ -10,6 +10,7 @@ remoteproc-y += remoteproc_debugfs.o remoteproc-y += remoteproc_sysfs.o remoteproc-y += remoteproc_virtio.o remoteproc-y += remoteproc_elf_loader.o +obj-$(CONFIG_REMOTEPROC_CDEV) += remoteproc_cdev.o obj-$(CONFIG_IMX_REMOTEPROC) += imx_rproc.o obj-$(CONFIG_INGENIC_VPU_RPROC) += ingenic_rproc.o obj-$(CONFIG_MTK_SCP) += mtk_scp.o mtk_scp_ipi.o diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c new file mode 100644 index 000000000000..b19ea3057bde --- /dev/null +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Character device interface driver for Remoteproc framework. + * + * Copyright (c) 2020, The Linux Foundation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "remoteproc_internal.h" + +#define NUM_RPROC_DEVICES 64 +static dev_t rproc_major; + +static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) +{ + struct rproc *rproc = container_of(filp->f_inode->i_cdev, struct rproc, cdev); + int ret = 0; + char cmd[10]; + + if (!len || len > sizeof(cmd)) + return -EINVAL; + + ret = copy_from_user(cmd, buf, len); + if (ret) + return -EFAULT; + + if (!strncmp(cmd, "start", len)) { + if (rproc->state == RPROC_RUNNING) + return -EBUSY; + + ret = rproc_boot(rproc); + } else if (!strncmp(cmd, "stop", len)) { + if (rproc->state != RPROC_RUNNING) + return -EINVAL; + + rproc_shutdown(rproc); + } else { + dev_err(&rproc->dev, "Unrecognized option\n"); + ret = -EINVAL; + } + + return ret ? ret : len; +} + +static long rproc_device_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct rproc *rproc = container_of(filp->f_inode->i_cdev, struct rproc, cdev); + void __user *argp = (void __user *)arg; + s32 param; + + switch (ioctl) { + case RPROC_SET_SHUTDOWN_ON_RELEASE: + if (copy_from_user(¶m, argp, sizeof(s32))) + return -EFAULT; + + rproc->cdev_put_on_release = !!param; + break; + case RPROC_GET_SHUTDOWN_ON_RELEASE: + param = (s32)rproc->cdev_put_on_release; + if (copy_to_user(argp, ¶m, sizeof(s32))) + return -EFAULT; + + break; + default: + dev_err(&rproc->dev, "Unsupported ioctl\n"); + return -EINVAL; + } + + return 0; +} + +static int rproc_cdev_release(struct inode *inode, struct file *filp) +{ + struct rproc *rproc = container_of(inode->i_cdev, struct rproc, cdev); + + if (rproc->cdev_put_on_release && rproc->state == RPROC_RUNNING) + rproc_shutdown(rproc); + + return 0; +} + +static const struct file_operations rproc_fops = { + .write = rproc_cdev_write, + .unlocked_ioctl = rproc_device_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = rproc_cdev_release, +}; + +int rproc_char_device_add(struct rproc *rproc) +{ + int ret; + + cdev_init(&rproc->cdev, &rproc_fops); + rproc->cdev.owner = THIS_MODULE; + + rproc->dev.devt = MKDEV(MAJOR(rproc_major), rproc->index); + cdev_set_parent(&rproc->cdev, &rproc->dev.kobj); + ret = cdev_add(&rproc->cdev, rproc->dev.devt, 1); + if (ret < 0) + dev_err(&rproc->dev, "Failed to add char dev for %s\n", rproc->name); + + return ret; +} + +void rproc_char_device_remove(struct rproc *rproc) +{ + __unregister_chrdev(MAJOR(rproc->dev.devt), rproc->index, 1, "remoteproc"); +} + +void __init rproc_init_cdev(void) +{ + int ret; + + ret = alloc_chrdev_region(&rproc_major, 0, NUM_RPROC_DEVICES, "remoteproc"); + if (ret < 0) + pr_err("Failed to alloc rproc_cdev region, err %d\n", ret); +} diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h index cd4176b033a3..c34002888d2c 100644 --- a/drivers/remoteproc/remoteproc_internal.h +++ b/drivers/remoteproc/remoteproc_internal.h @@ -53,6 +53,34 @@ void rproc_exit_sysfs(void); void rproc_coredump_cleanup(struct rproc *rproc); void rproc_coredump(struct rproc *rproc); +#ifdef CONFIG_REMOTEPROC_CDEV +void rproc_init_cdev(void); +void rproc_exit_cdev(void); +int rproc_char_device_add(struct rproc *rproc); +void rproc_char_device_remove(struct rproc *rproc); +#else +static inline void rproc_init_cdev(void) +{ +} + +static inline void rproc_exit_cdev(void) +{ +} + +/* + * The character device interface is an optional feature, if it is not enabled + * the function should not return an error. + */ +static inline int rproc_char_device_add(struct rproc *rproc) +{ + return 0; +} + +static inline void rproc_char_device_remove(struct rproc *rproc) +{ +} +#endif + void rproc_free_vring(struct rproc_vring *rvring); int rproc_alloc_vring(struct rproc_vdev *rvdev, int i); diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 0e8d2ff575b4..2fa68bf5aa4f 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -509,6 +510,8 @@ struct rproc_dump_segment { * @autonomous: true if an external entity has booted the remote processor * @dump_segments: list of segments in the firmware * @nb_vdev: number of vdev currently handled by rproc + * @char_dev: character device of the rproc + * @cdev_put_on_release: flag to indicate if remoteproc should be shutdown on @char_dev release */ struct rproc { struct list_head node; @@ -546,6 +549,8 @@ struct rproc { int nb_vdev; u8 elf_class; u16 elf_machine; + struct cdev cdev; + bool cdev_put_on_release; }; /** diff --git a/include/uapi/linux/remoteproc_cdev.h b/include/uapi/linux/remoteproc_cdev.h new file mode 100644 index 000000000000..c43768e4b0dc --- /dev/null +++ b/include/uapi/linux/remoteproc_cdev.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * IOCTLs for Remoteproc's character device interface. + * + * Copyright (c) 2020, The Linux Foundation. All rights reserved. + */ + +#ifndef _UAPI_REMOTEPROC_CDEV_H_ +#define _UAPI_REMOTEPROC_CDEV_H_ + +#include +#include + +#define RPROC_MAGIC 0xB7 + +/* + * The RPROC_SET_SHUTDOWN_ON_RELEASE ioctl allows to enable/disable the shutdown of a remote + * processor automatically when the controlling userpsace closes the char device interface. + * + * input parameter: integer + * 0 : disable automatic shutdown + * other : enable automatic shutdown + */ +#define RPROC_SET_SHUTDOWN_ON_RELEASE _IOW(RPROC_MAGIC, 1, __s32) + +/* + * The RPROC_GET_SHUTDOWN_ON_RELEASE ioctl gets information about whether the automatic shutdown of + * a remote processor is enabled or disabled when the controlling userspace closes the char device + * interface. + * + * output parameter: integer + * 0 : automatic shutdown disable + * other : automatic shutdown enable + */ +#define RPROC_GET_SHUTDOWN_ON_RELEASE _IOR(RPROC_MAGIC, 2, __s32) + +#endif -- cgit v1.2.3 From cae19a6386c86dec8ec2810a96d7497a2eec8d38 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_9p: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_9p.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_9p.h b/include/uapi/linux/virtio_9p.h index 277c4ad44e84..441047432258 100644 --- a/include/uapi/linux/virtio_9p.h +++ b/include/uapi/linux/virtio_9p.h @@ -25,7 +25,7 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include +#include #include #include @@ -36,7 +36,7 @@ struct virtio_9p_config { /* length of the tag name */ - __u16 tag_len; + __virtio16 tag_len; /* non-NULL terminated tag name */ __u8 tag[0]; } __attribute__((packed)); -- cgit v1.2.3 From c73cb10cc4421baa669c1edd8211086280273216 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_balloon: correct tags for config space fields Tag config space fields as having little endian-ness. Note that balloon is special: LE even when using the legacy interface. Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_balloon.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index dc3e656470dd..ddaa45e723c4 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -45,20 +45,20 @@ #define VIRTIO_BALLOON_CMD_ID_DONE 1 struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ - __u32 num_pages; + __le32 num_pages; /* Number of pages we've actually got in balloon. */ - __u32 actual; + __le32 actual; /* * Free page hint command id, readonly by guest. * Was previously named free_page_report_cmd_id so we * need to carry that name for legacy support. */ union { - __u32 free_page_hint_cmd_id; - __u32 free_page_report_cmd_id; /* deprecated */ + __le32 free_page_hint_cmd_id; + __le32 free_page_report_cmd_id; /* deprecated */ }; /* Stores PAGE_POISON if page poisoning is in use */ - __u32 poison_val; + __le32 poison_val; }; #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ -- cgit v1.2.3 From 40e04c488bd6ab1859778d40bf9bb3ca294ab97b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_blk: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck Reviewed-by: Stefano Garzarella Reviewed-by: Stefano Garzarella --- include/uapi/linux/virtio_blk.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 0f99d7b49ede..d888f013d9ff 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -57,20 +57,20 @@ struct virtio_blk_config { /* The capacity (in 512-byte sectors). */ - __u64 capacity; + __virtio64 capacity; /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ - __u32 size_max; + __virtio32 size_max; /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ - __u32 seg_max; + __virtio32 seg_max; /* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */ struct virtio_blk_geometry { - __u16 cylinders; + __virtio16 cylinders; __u8 heads; __u8 sectors; } geometry; /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */ - __u32 blk_size; + __virtio32 blk_size; /* the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY */ /* exponent for physical block per logical block. */ @@ -78,42 +78,42 @@ struct virtio_blk_config { /* alignment offset in logical blocks. */ __u8 alignment_offset; /* minimum I/O size without performance penalty in logical blocks. */ - __u16 min_io_size; + __virtio16 min_io_size; /* optimal sustained I/O size in logical blocks. */ - __u32 opt_io_size; + __virtio32 opt_io_size; /* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */ __u8 wce; __u8 unused; /* number of vqs, only available when VIRTIO_BLK_F_MQ is set */ - __u16 num_queues; + __virtio16 num_queues; /* the next 3 entries are guarded by VIRTIO_BLK_F_DISCARD */ /* * The maximum discard sectors (in 512-byte sectors) for * one segment. */ - __u32 max_discard_sectors; + __virtio32 max_discard_sectors; /* * The maximum number of discard segments in a * discard command. */ - __u32 max_discard_seg; + __virtio32 max_discard_seg; /* Discard commands must be aligned to this number of sectors. */ - __u32 discard_sector_alignment; + __virtio32 discard_sector_alignment; /* the next 3 entries are guarded by VIRTIO_BLK_F_WRITE_ZEROES */ /* * The maximum number of write zeroes sectors (in 512-byte sectors) in * one segment. */ - __u32 max_write_zeroes_sectors; + __virtio32 max_write_zeroes_sectors; /* * The maximum number of segments in a write zeroes * command. */ - __u32 max_write_zeroes_seg; + __virtio32 max_write_zeroes_seg; /* * Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the * deallocation of one or more of the sectors. -- cgit v1.2.3 From dbe2dc8c5838ef415620a4fe691570b062ab046f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_console: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_console.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_console.h b/include/uapi/linux/virtio_console.h index b7fb108c9310..7e6ec2ff0560 100644 --- a/include/uapi/linux/virtio_console.h +++ b/include/uapi/linux/virtio_console.h @@ -45,13 +45,13 @@ struct virtio_console_config { /* colums of the screens */ - __u16 cols; + __virtio16 cols; /* rows of the screens */ - __u16 rows; + __virtio16 rows; /* max. number of ports this device can hold */ - __u32 max_nr_ports; + __virtio32 max_nr_ports; /* emergency write register */ - __u32 emerg_wr; + __virtio32 emerg_wr; } __attribute__((packed)); /* -- cgit v1.2.3 From 24bcf35b695ef5228f3035eb979cc2de571d560b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_crypto: correct tags for config space fields Since crypto is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_crypto.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_crypto.h b/include/uapi/linux/virtio_crypto.h index 50cdc8aebfcf..a03932f10565 100644 --- a/include/uapi/linux/virtio_crypto.h +++ b/include/uapi/linux/virtio_crypto.h @@ -414,33 +414,33 @@ struct virtio_crypto_op_data_req { struct virtio_crypto_config { /* See VIRTIO_CRYPTO_OP_* above */ - __u32 status; + __le32 status; /* * Maximum number of data queue */ - __u32 max_dataqueues; + __le32 max_dataqueues; /* * Specifies the services mask which the device support, * see VIRTIO_CRYPTO_SERVICE_* above */ - __u32 crypto_services; + __le32 crypto_services; /* Detailed algorithms mask */ - __u32 cipher_algo_l; - __u32 cipher_algo_h; - __u32 hash_algo; - __u32 mac_algo_l; - __u32 mac_algo_h; - __u32 aead_algo; + __le32 cipher_algo_l; + __le32 cipher_algo_h; + __le32 hash_algo; + __le32 mac_algo_l; + __le32 mac_algo_h; + __le32 aead_algo; /* Maximum length of cipher key */ - __u32 max_cipher_key_len; + __le32 max_cipher_key_len; /* Maximum length of authenticated key */ - __u32 max_auth_key_len; - __u32 reserve; + __le32 max_auth_key_len; + __le32 reserve; /* Maximum size of each crypto request's content */ - __u64 max_size; + __le64 max_size; }; struct virtio_crypto_inhdr { -- cgit v1.2.3 From fc4a1accbb4ef372bb55b7ab161cf88e3b631935 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_fs: correct tags for config space fields Since fs is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Acked-by: Vivek Goyal Acked-by: Vivek Goyal Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h index b02eb2ac3d99..3056b6e9f8ce 100644 --- a/include/uapi/linux/virtio_fs.h +++ b/include/uapi/linux/virtio_fs.h @@ -13,7 +13,7 @@ struct virtio_fs_config { __u8 tag[36]; /* Number of request queues */ - __u32 num_request_queues; + __le32 num_request_queues; } __attribute__((packed)); #endif /* _UAPI_LINUX_VIRTIO_FS_H */ -- cgit v1.2.3 From f378444b7c97e39358de5d50d01fb0e92f259073 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_gpu: correct tags for config space fields Since gpu is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_gpu.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_gpu.h b/include/uapi/linux/virtio_gpu.h index 0c85914d9369..ccbd174ef321 100644 --- a/include/uapi/linux/virtio_gpu.h +++ b/include/uapi/linux/virtio_gpu.h @@ -320,10 +320,10 @@ struct virtio_gpu_resp_edid { #define VIRTIO_GPU_EVENT_DISPLAY (1 << 0) struct virtio_gpu_config { - __u32 events_read; - __u32 events_clear; - __u32 num_scanouts; - __u32 num_capsets; + __le32 events_read; + __le32 events_clear; + __le32 num_scanouts; + __le32 num_capsets; }; /* simple formats for fbcon/X use */ -- cgit v1.2.3 From 924b59a6dfa85dc9eac4c7f2fe1857bba2cb2510 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_input: correct tags for config space fields Since this is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Gerd Hoffmann Reviewed-by: Gerd Hoffmann Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_input.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_input.h b/include/uapi/linux/virtio_input.h index a7fe5c8fb135..52084b1fb965 100644 --- a/include/uapi/linux/virtio_input.h +++ b/include/uapi/linux/virtio_input.h @@ -40,18 +40,18 @@ enum virtio_input_config_select { }; struct virtio_input_absinfo { - __u32 min; - __u32 max; - __u32 fuzz; - __u32 flat; - __u32 res; + __le32 min; + __le32 max; + __le32 fuzz; + __le32 flat; + __le32 res; }; struct virtio_input_devids { - __u16 bustype; - __u16 vendor; - __u16 product; - __u16 version; + __le16 bustype; + __le16 vendor; + __le16 product; + __le16 version; }; struct virtio_input_config { -- cgit v1.2.3 From 0ebcffcc2731682777bab19b51a512d8f31e1bdd Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_iommu: correct tags for config space fields Since this is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Jean-Philippe Brucker Reviewed-by: Jean-Philippe Brucker Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_iommu.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h index 48e3c29223b5..237e36a280cb 100644 --- a/include/uapi/linux/virtio_iommu.h +++ b/include/uapi/linux/virtio_iommu.h @@ -18,24 +18,24 @@ #define VIRTIO_IOMMU_F_MMIO 5 struct virtio_iommu_range_64 { - __u64 start; - __u64 end; + __le64 start; + __le64 end; }; struct virtio_iommu_range_32 { - __u32 start; - __u32 end; + __le32 start; + __le32 end; }; struct virtio_iommu_config { /* Supported page sizes */ - __u64 page_size_mask; + __le64 page_size_mask; /* Supported IOVA range */ struct virtio_iommu_range_64 input_range; /* Max domain ID size */ struct virtio_iommu_range_32 domain_range; /* Probe buffer size */ - __u32 probe_size; + __le32 probe_size; }; /* Request types */ -- cgit v1.2.3 From 79268954424771185fb4ca304786dd561a272246 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_mem: correct tags for config space fields Since this is a modern-only device, tag config space fields as having little endian-ness. TODO: check other uses of __virtioXX types in this header, should probably be __leXX. Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_mem.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_mem.h b/include/uapi/linux/virtio_mem.h index a9ffe041843c..70e01c687d5e 100644 --- a/include/uapi/linux/virtio_mem.h +++ b/include/uapi/linux/virtio_mem.h @@ -185,27 +185,27 @@ struct virtio_mem_resp { struct virtio_mem_config { /* Block size and alignment. Cannot change. */ - __u64 block_size; + __le64 block_size; /* Valid with VIRTIO_MEM_F_ACPI_PXM. Cannot change. */ - __u16 node_id; + __le16 node_id; __u8 padding[6]; /* Start address of the memory region. Cannot change. */ - __u64 addr; + __le64 addr; /* Region size (maximum). Cannot change. */ - __u64 region_size; + __le64 region_size; /* * Currently usable region size. Can grow up to region_size. Can * shrink due to VIRTIO_MEM_REQ_UNPLUG_ALL (in which case no config * update will be sent). */ - __u64 usable_region_size; + __le64 usable_region_size; /* * Currently used size. Changes due to plug/unplug requests, but no * config updates will be sent. */ - __u64 plugged_size; + __le64 plugged_size; /* Requested size. New plug requests cannot exceed it. Can change. */ - __u64 requested_size; + __le64 requested_size; }; #endif /* _LINUX_VIRTIO_MEM_H */ -- cgit v1.2.3 From 577e677a785357542311a645eeb1756cd83988be Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_net: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_net.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 19d23e5baa4e..27d996f29dd1 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -87,19 +87,19 @@ struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ __u8 mac[ETH_ALEN]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ - __u16 status; + __virtio16 status; /* Maximum number of each of transmit and receive queues; * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ. * Legal values are between 1 and 0x8000 */ - __u16 max_virtqueue_pairs; + __virtio16 max_virtqueue_pairs; /* Default maximum transmit unit advice */ - __u16 mtu; + __virtio16 mtu; /* * speed, in units of 1Mb. All values 0 to INT_MAX are legal. * Any other value stands for unknown. */ - __u32 speed; + __virtio32 speed; /* * 0x00 - half duplex * 0x01 - full duplex -- cgit v1.2.3 From a28feb855cc0ad452acd3dfe2b8f2841927da5f1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_pmem: correct tags for config space fields Since this is a modern-only device, tag config space fields as having little endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_pmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h index b022787ffb94..d676b3620383 100644 --- a/include/uapi/linux/virtio_pmem.h +++ b/include/uapi/linux/virtio_pmem.h @@ -15,8 +15,8 @@ #include struct virtio_pmem_config { - __u64 start; - __u64 size; + __le64 start; + __le64 size; }; #define VIRTIO_PMEM_REQ_TYPE_FLUSH 0 -- cgit v1.2.3 From 965b5350514b597dc6347b733127e180844aeb43 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 10 Jul 2020 07:17:13 -0400 Subject: virtio_scsi: correct tags for config space fields Tag config space fields as having virtio endian-ness. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- drivers/scsi/virtio_scsi.c | 4 ++-- include/uapi/linux/virtio_scsi.h | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 0e0910c5b942..c36aeb9a1330 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -746,14 +746,14 @@ static struct scsi_host_template virtscsi_host_template = { #define virtscsi_config_get(vdev, fld) \ ({ \ - typeof(((struct virtio_scsi_config *)0)->fld) __val; \ + __virtio_native_type(struct virtio_scsi_config, fld) __val; \ virtio_cread(vdev, struct virtio_scsi_config, fld, &__val); \ __val; \ }) #define virtscsi_config_set(vdev, fld, val) \ do { \ - typeof(((struct virtio_scsi_config *)0)->fld) __val = (val); \ + __virtio_native_type(struct virtio_scsi_config, fld) __val = (val); \ virtio_cwrite(vdev, struct virtio_scsi_config, fld, &__val); \ } while(0) diff --git a/include/uapi/linux/virtio_scsi.h b/include/uapi/linux/virtio_scsi.h index cc18ef8825c0..0abaae4027c0 100644 --- a/include/uapi/linux/virtio_scsi.h +++ b/include/uapi/linux/virtio_scsi.h @@ -103,16 +103,16 @@ struct virtio_scsi_event { } __attribute__((packed)); struct virtio_scsi_config { - __u32 num_queues; - __u32 seg_max; - __u32 max_sectors; - __u32 cmd_per_lun; - __u32 event_info_size; - __u32 sense_size; - __u32 cdb_size; - __u16 max_channel; - __u16 max_target; - __u32 max_lun; + __virtio32 num_queues; + __virtio32 seg_max; + __virtio32 max_sectors; + __virtio32 cmd_per_lun; + __virtio32 event_info_size; + __virtio32 sense_size; + __virtio32 cdb_size; + __virtio16 max_channel; + __virtio16 max_target; + __virtio32 max_lun; } __attribute__((packed)); /* Feature Bits */ -- cgit v1.2.3 From 64ffa39dc860fb9772225c694353f73eca5801c6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 5 Aug 2020 05:39:36 -0400 Subject: virtio_net: use LE accessors for speed/duplex Speed and duplex config fields depend on VIRTIO_NET_F_SPEED_DUPLEX which being 63>31 depends on VIRTIO_F_VERSION_1. Accordingly, use LE accessors for these fields. Reported-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 9 +++++---- include/uapi/linux/virtio_net.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ba38765dc490..0934b1ec5320 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2264,12 +2264,13 @@ static void virtnet_update_settings(struct virtnet_info *vi) if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX)) return; - speed = virtio_cread32(vi->vdev, offsetof(struct virtio_net_config, - speed)); + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed); + if (ethtool_validate_speed(speed)) vi->speed = speed; - duplex = virtio_cread8(vi->vdev, offsetof(struct virtio_net_config, - duplex)); + + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex); + if (ethtool_validate_duplex(duplex)) vi->duplex = duplex; } diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 27d996f29dd1..3f55a4215f11 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -99,7 +99,7 @@ struct virtio_net_config { * speed, in units of 1Mb. All values 0 to INT_MAX are legal. * Any other value stands for unknown. */ - __virtio32 speed; + __le32 speed; /* * 0x00 - half duplex * 0x01 - full duplex -- cgit v1.2.3 From 25abc060d282132ea5c945392f900dca0a7e9bbb Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 4 Aug 2020 19:20:40 +0300 Subject: vhost-vdpa: support IOTLB batching hints This patches extend the vhost IOTLB API to accept batch updating hints form userspace. When userspace wants update the device IOTLB in a batch, it may do: 1) Write vhost_iotlb_msg with VHOST_IOTLB_BATCH_BEGIN flag 2) Perform a batch of IOTLB updating via VHOST_IOTLB_UPDATE/INVALIDATE 3) Write vhost_iotlb_msg with VHOST_IOTLB_BATCH_END flag Vhost-vdpa may decide to batch the IOMMU/IOTLB updating in step 3 when vDPA device support set_map() ops. This is useful for the vDPA device that want to know all the mappings to tweak their own DMA translation logic. For vDPA device that doesn't require set_map(), no behavior changes. This capability is advertised via VHOST_BACKEND_F_IOTLB_BATCH capability. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20200804162048.22587-5-eli@mellanox.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 36 +++++++++++++++++++++++++++--------- include/uapi/linux/vhost.h | 2 ++ include/uapi/linux/vhost_types.h | 11 +++++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 61c17d34cb39..e80db051845d 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -27,7 +27,9 @@ #include "vhost.h" enum { - VHOST_VDPA_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) + VHOST_VDPA_BACKEND_FEATURES = + (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) | + (1ULL << VHOST_BACKEND_F_IOTLB_BATCH), }; /* Currently, only network backend w/o multiqueue is supported. */ @@ -48,6 +50,7 @@ struct vhost_vdpa { int virtio_id; int minor; struct eventfd_ctx *config_ctx; + int in_batch; }; static DEFINE_IDA(vhost_vdpa_ida); @@ -124,6 +127,7 @@ static void vhost_vdpa_reset(struct vhost_vdpa *v) struct vdpa_device *vdpa = v->vdpa; vdpa_reset(vdpa); + v->in_batch = 0; } static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp) @@ -546,13 +550,15 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) return r; - if (ops->dma_map) + if (ops->dma_map) { r = ops->dma_map(vdpa, iova, size, pa, perm); - else if (ops->set_map) - r = ops->set_map(vdpa, dev->iotlb); - else + } else if (ops->set_map) { + if (!v->in_batch) + r = ops->set_map(vdpa, dev->iotlb); + } else { r = iommu_map(v->domain, iova, pa, size, perm_to_iommu_flags(perm)); + } return r; } @@ -565,12 +571,14 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1); - if (ops->dma_map) + if (ops->dma_map) { ops->dma_unmap(vdpa, iova, size); - else if (ops->set_map) - ops->set_map(vdpa, dev->iotlb); - else + } else if (ops->set_map) { + if (!v->in_batch) + ops->set_map(vdpa, dev->iotlb); + } else { iommu_unmap(v->domain, iova, size); + } } static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, @@ -663,6 +671,8 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, struct vhost_iotlb_msg *msg) { struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; int r = 0; r = vhost_dev_check_owner(dev); @@ -676,6 +686,14 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, case VHOST_IOTLB_INVALIDATE: vhost_vdpa_unmap(v, msg->iova, msg->size); break; + case VHOST_IOTLB_BATCH_BEGIN: + v->in_batch = true; + break; + case VHOST_IOTLB_BATCH_END: + if (v->in_batch && ops->set_map) + ops->set_map(vdpa, dev->iotlb); + v->in_batch = false; + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 0c2349612e77..75232185324a 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -91,6 +91,8 @@ /* Use message type V2 */ #define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 +/* IOTLB can accept batching hints */ +#define VHOST_BACKEND_F_IOTLB_BATCH 0x2 #define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) #define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 669457ce5c48..9a269a88a6ff 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -60,6 +60,17 @@ struct vhost_iotlb_msg { #define VHOST_IOTLB_UPDATE 2 #define VHOST_IOTLB_INVALIDATE 3 #define VHOST_IOTLB_ACCESS_FAIL 4 +/* + * VHOST_IOTLB_BATCH_BEGIN and VHOST_IOTLB_BATCH_END allow modifying + * multiple mappings in one go: beginning with + * VHOST_IOTLB_BATCH_BEGIN, followed by any number of + * VHOST_IOTLB_UPDATE messages, and ending with VHOST_IOTLB_BATCH_END. + * When one of these two values is used as the message type, the rest + * of the fields in the message are ignored. There's no guarantee that + * these changes take place automatically in the device. + */ +#define VHOST_IOTLB_BATCH_BEGIN 5 +#define VHOST_IOTLB_BATCH_END 6 __u8 type; }; -- cgit v1.2.3 From 5e7b30205cef80f6bb922e61834437ca7bff5837 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 4 Aug 2020 22:50:56 -0700 Subject: bpf: Change uapi for bpf iterator map elements Commit a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") added bpf iterator support for map elements. The map element bpf iterator requires info to identify a particular map. In the above commit, the attr->link_create.target_fd is used to carry map_fd and an enum bpf_iter_link_info is added to uapi to specify the target_fd actually representing a map_fd: enum bpf_iter_link_info { BPF_ITER_LINK_UNSPEC = 0, BPF_ITER_LINK_MAP_FD = 1, MAX_BPF_ITER_LINK_INFO, }; This is an extensible approach as we can grow enumerator for pid, cgroup_id, etc. and we can unionize target_fd for pid, cgroup_id, etc. But in the future, there are chances that more complex customization may happen, e.g., for tasks, it could be filtered based on both cgroup_id and user_id. This patch changed the uapi to have fields __aligned_u64 iter_info; __u32 iter_info_len; for additional iter_info for link_create. The iter_info is defined as union bpf_iter_link_info { struct { __u32 map_fd; } map; }; So future extension for additional customization will be easier. The bpf_iter_link_info will be passed to target callback to validate and generic bpf_iter framework does not need to deal it any more. Note that map_fd = 0 will be considered invalid and -EBADF will be returned to user space. Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com --- include/linux/bpf.h | 10 ++++---- include/uapi/linux/bpf.h | 15 ++++++------ kernel/bpf/bpf_iter.c | 58 +++++++++++++++++++++++------------------------ kernel/bpf/map_iter.c | 37 +++++++++++++++++++++++------- kernel/bpf/syscall.c | 2 +- net/core/bpf_sk_storage.c | 37 +++++++++++++++++++++++------- 6 files changed, 102 insertions(+), 57 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cef4ef0d2b4e..55f694b63164 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1214,15 +1214,17 @@ struct bpf_iter_aux_info { struct bpf_map *map; }; -typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux); +typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; - bpf_iter_check_target_t check_target; + bpf_iter_attach_target_t attach_target; + bpf_iter_detach_target_t detach_target; u32 ctx_arg_info_size; - enum bpf_iter_link_info req_linfo; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b134e679e9db..0480f893facd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,12 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type */ }; +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -249,13 +255,6 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; -enum bpf_iter_link_info { - BPF_ITER_LINK_UNSPEC = 0, - BPF_ITER_LINK_MAP_FD = 1, - - MAX_BPF_ITER_LINK_INFO, -}; - /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -623,6 +622,8 @@ union bpf_attr { }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 363b9cafc2d8..b6715964b685 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -338,8 +338,8 @@ static void bpf_iter_link_release(struct bpf_link *link) struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); - if (iter_link->aux.map) - bpf_map_put_with_uref(iter_link->aux.map); + if (iter_link->tinfo->reg_info->detach_target) + iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -390,15 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + union bpf_iter_link_info __user *ulinfo; struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; - struct bpf_iter_aux_info aux = {}; + union bpf_iter_link_info linfo; struct bpf_iter_link *link; - u32 prog_btf_id, target_fd; + u32 prog_btf_id, linfo_len; bool existed = false; - struct bpf_map *map; int err; + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + memset(&linfo, 0, sizeof(union bpf_iter_link_info)); + + ulinfo = u64_to_user_ptr(attr->link_create.iter_info); + linfo_len = attr->link_create.iter_info_len; + if (!ulinfo ^ !linfo_len) + return -EINVAL; + + if (ulinfo) { + err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), + linfo_len); + if (err) + return err; + linfo_len = min_t(u32, linfo_len, sizeof(linfo)); + if (copy_from_user(&linfo, ulinfo, linfo_len)) + return -EFAULT; + } + prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -411,13 +431,6 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; - /* Make sure user supplied flags are target expected. */ - target_fd = attr->link_create.target_fd; - if (attr->link_create.flags != tinfo->reg_info->req_linfo) - return -EINVAL; - if (!attr->link_create.flags && target_fd) - return -EINVAL; - link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -431,28 +444,15 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } - if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { - map = bpf_map_get_with_uref(target_fd); - if (IS_ERR(map)) { - err = PTR_ERR(map); - goto cleanup_link; - } - - aux.map = map; - err = tinfo->reg_info->check_target(prog, &aux); + if (tinfo->reg_info->attach_target) { + err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); if (err) { - bpf_map_put_with_uref(map); - goto cleanup_link; + bpf_link_cleanup(&link_primer); + return err; } - - link->aux.map = map; } return bpf_link_settle(&link_primer); - -cleanup_link: - bpf_link_cleanup(&link_primer); - return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index fbe1f557cb88..af86048e5afd 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,12 +98,21 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { u32 key_acc_size, value_acc_size, key_size, value_size; - struct bpf_map *map = aux->map; + struct bpf_map *map; bool is_percpu = false; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -112,7 +121,7 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) - return -EINVAL; + goto put_map; key_acc_size = prog->aux->max_rdonly_access; value_acc_size = prog->aux->max_rdwr_access; @@ -122,10 +131,22 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else value_size = round_up(map->value_size, 8) * num_possible_cpus(); - if (key_acc_size > key_size || value_acc_size > value_size) - return -EACCES; + if (key_acc_size > key_size || value_acc_size > value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, @@ -133,8 +154,8 @@ DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, static const struct bpf_iter_reg bpf_map_elem_reg_info = { .target = "bpf_map_elem", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2f343ce15747..86299a292214 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3883,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog * return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len static int link_create(union bpf_attr *attr) { enum bpf_prog_type ptype; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d3377c90a291..b988f48153a4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -1384,18 +1384,39 @@ static int bpf_iter_init_sk_storage_map(void *priv_data, return 0; } -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { - struct bpf_map *map = aux->map; + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) - return -EINVAL; + goto put_map; - if (prog->aux->max_rdonly_access > map->value_size) - return -EACCES; + if (prog->aux->max_rdonly_access > map->value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } static const struct seq_operations bpf_sk_storage_map_seq_ops = { @@ -1414,8 +1435,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), -- cgit v1.2.3 From 7f317d34906c1033f0752fc137dda04e43979bb8 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Tue, 11 Aug 2020 18:34:19 -0700 Subject: include/: replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Signed-off-by: Alexander A. Klimov Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Link: http://lkml.kernel.org/r/20200726110117.16346-1-grandmaster@al2klimov.de Signed-off-by: Linus Torvalds --- include/clocksource/timer-ti-dm.h | 2 +- include/linux/btree.h | 2 +- include/linux/delay.h | 2 +- include/linux/dma/k3-psil.h | 2 +- include/linux/dma/k3-udma-glue.h | 2 +- include/linux/dma/ti-cppi5.h | 2 +- include/linux/irqchip/irq-omap-intc.h | 2 +- include/linux/jhash.h | 2 +- include/linux/leds-ti-lmu-common.h | 2 +- include/linux/platform_data/davinci-cpufreq.h | 2 +- include/linux/platform_data/davinci_asp.h | 2 +- include/linux/platform_data/elm.h | 2 +- include/linux/platform_data/gpio-davinci.h | 2 +- include/linux/platform_data/gpmc-omap.h | 2 +- include/linux/platform_data/mtd-davinci-aemif.h | 2 +- include/linux/platform_data/omap-twl4030.h | 2 +- include/linux/platform_data/uio_pruss.h | 2 +- include/linux/platform_data/usb-omap.h | 2 +- include/linux/soc/ti/k3-ringacc.h | 2 +- include/linux/soc/ti/knav_qmss.h | 2 +- include/linux/soc/ti/ti-msgmgr.h | 2 +- include/linux/wkup_m3_ipc.h | 2 +- include/linux/xxhash.h | 2 +- include/linux/xz.h | 2 +- include/linux/zlib.h | 2 +- include/soc/arc/aux.h | 2 +- include/uapi/linux/elf.h | 2 +- include/uapi/linux/map_to_7segment.h | 2 +- include/uapi/linux/types.h | 2 +- include/uapi/linux/usb/ch9.h | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/uapi') diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index 531ca87fcd08..4c61dade8835 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -1,7 +1,7 @@ /* * OMAP Dual-Mode Timers * - * Copyright (C) 2010 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2010 Texas Instruments Incorporated - https://www.ti.com/ * Tarun Kanti DebBarma * Thara Gopinath * diff --git a/include/linux/btree.h b/include/linux/btree.h index 68f858c831b1..243ee544397a 100644 --- a/include/linux/btree.h +++ b/include/linux/btree.h @@ -10,7 +10,7 @@ * * A B+Tree is a data structure for looking up arbitrary (currently allowing * unsigned long, u32, u64 and 2 * u64) keys into pointers. The data structure - * is described at http://en.wikipedia.org/wiki/B-tree, we currently do not + * is described at https://en.wikipedia.org/wiki/B-tree, we currently do not * use binary search to find the key on lookups. * * Each B+Tree consists of a head, that contains bookkeeping information and diff --git a/include/linux/delay.h b/include/linux/delay.h index 5e016a4029d9..1d0e2ce6b6d9 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -16,7 +16,7 @@ * 3. CPU clock rate changes. * * Please see this thread: - * http://lists.openwall.net/linux-kernel/2011/01/09/56 + * https://lists.openwall.net/linux-kernel/2011/01/09/56 */ #include diff --git a/include/linux/dma/k3-psil.h b/include/linux/dma/k3-psil.h index 61d5cc0ad601..1962f75fa2d3 100644 --- a/include/linux/dma/k3-psil.h +++ b/include/linux/dma/k3-psil.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2019 Texas Instruments Incorporated - https://www.ti.com */ #ifndef K3_PSIL_H_ diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index caadbab1632a..5eb34ad973a7 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2019 Texas Instruments Incorporated - https://www.ti.com */ #ifndef K3_UDMA_GLUE_H_ diff --git a/include/linux/dma/ti-cppi5.h b/include/linux/dma/ti-cppi5.h index 579356ae447e..5896441ee604 100644 --- a/include/linux/dma/ti-cppi5.h +++ b/include/linux/dma/ti-cppi5.h @@ -2,7 +2,7 @@ /* * CPPI5 descriptors interface * - * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2019 Texas Instruments Incorporated - https://www.ti.com */ #ifndef __TI_CPPI5_H__ diff --git a/include/linux/irqchip/irq-omap-intc.h b/include/linux/irqchip/irq-omap-intc.h index 216e5adf80ce..dca379c0d7eb 100644 --- a/include/linux/irqchip/irq-omap-intc.h +++ b/include/linux/irqchip/irq-omap-intc.h @@ -2,7 +2,7 @@ /** * irq-omap-intc.h - INTC Idle Functions * - * Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2014 Texas Instruments Incorporated - https://www.ti.com * * Author: Felipe Balbi */ diff --git a/include/linux/jhash.h b/include/linux/jhash.h index ba2f6a9776b6..19ddd43aee68 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -5,7 +5,7 @@ * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * - * http://burtleburtle.net/bob/hash/ + * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * diff --git a/include/linux/leds-ti-lmu-common.h b/include/linux/leds-ti-lmu-common.h index 5eb111f38803..420b61e5a213 100644 --- a/include/linux/leds-ti-lmu-common.h +++ b/include/linux/leds-ti-lmu-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ // TI LMU Common Core -// Copyright (C) 2018 Texas Instruments Incorporated - http://www.ti.com/ +// Copyright (C) 2018 Texas Instruments Incorporated - https://www.ti.com/ #ifndef _TI_LMU_COMMON_H_ #define _TI_LMU_COMMON_H_ diff --git a/include/linux/platform_data/davinci-cpufreq.h b/include/linux/platform_data/davinci-cpufreq.h index 3fbf9f2793b5..bc208c64e3d7 100644 --- a/include/linux/platform_data/davinci-cpufreq.h +++ b/include/linux/platform_data/davinci-cpufreq.h @@ -2,7 +2,7 @@ /* * TI DaVinci CPUFreq platform support. * - * Copyright (C) 2009 Texas Instruments, Inc. http://www.ti.com/ + * Copyright (C) 2009 Texas Instruments, Inc. https://www.ti.com/ */ #ifndef _MACH_DAVINCI_CPUFREQ_H diff --git a/include/linux/platform_data/davinci_asp.h b/include/linux/platform_data/davinci_asp.h index 7fe80f1c7e08..5d1fb0d78a22 100644 --- a/include/linux/platform_data/davinci_asp.h +++ b/include/linux/platform_data/davinci_asp.h @@ -1,7 +1,7 @@ /* * TI DaVinci Audio Serial Port support * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com/ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as diff --git a/include/linux/platform_data/elm.h b/include/linux/platform_data/elm.h index 0f491d8abfdd..3cc78f0447b1 100644 --- a/include/linux/platform_data/elm.h +++ b/include/linux/platform_data/elm.h @@ -2,7 +2,7 @@ /* * BCH Error Location Module * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com/ */ #ifndef __ELM_H diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h index a93841bfb9f7..e182a46e609f 100644 --- a/include/linux/platform_data/gpio-davinci.h +++ b/include/linux/platform_data/gpio-davinci.h @@ -1,7 +1,7 @@ /* * DaVinci GPIO Platform Related Defines * - * Copyright (C) 2013 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2013 Texas Instruments Incorporated - https://www.ti.com/ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h index ef663e570552..c9cc4e32435d 100644 --- a/include/linux/platform_data/gpmc-omap.h +++ b/include/linux/platform_data/gpmc-omap.h @@ -2,7 +2,7 @@ /* * OMAP GPMC Platform data * - * Copyright (C) 2014 Texas Instruments, Inc. - http://www.ti.com + * Copyright (C) 2014 Texas Instruments, Inc. - https://www.ti.com * Roger Quadros */ diff --git a/include/linux/platform_data/mtd-davinci-aemif.h b/include/linux/platform_data/mtd-davinci-aemif.h index a403dd51dacc..a49826214a39 100644 --- a/include/linux/platform_data/mtd-davinci-aemif.h +++ b/include/linux/platform_data/mtd-davinci-aemif.h @@ -1,7 +1,7 @@ /* * TI DaVinci AEMIF support * - * Copyright 2010 (C) Texas Instruments, Inc. http://www.ti.com/ + * Copyright 2010 (C) Texas Instruments, Inc. https://www.ti.com/ * * This file is licensed under the terms of the GNU General Public License * version 2. This program is licensed "as is" without any warranty of any diff --git a/include/linux/platform_data/omap-twl4030.h b/include/linux/platform_data/omap-twl4030.h index 8419c8caf54e..0dd851ea1c72 100644 --- a/include/linux/platform_data/omap-twl4030.h +++ b/include/linux/platform_data/omap-twl4030.h @@ -3,7 +3,7 @@ * omap-twl4030.h - ASoC machine driver for TI SoC based boards with twl4030 * codec, header. * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com * All rights reserved. * * Author: Peter Ujfalusi diff --git a/include/linux/platform_data/uio_pruss.h b/include/linux/platform_data/uio_pruss.h index 3d47d219827f..31f2e22661bc 100644 --- a/include/linux/platform_data/uio_pruss.h +++ b/include/linux/platform_data/uio_pruss.h @@ -3,7 +3,7 @@ * * Platform data for uio_pruss driver * - * Copyright (C) 2010-11 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2010-11 Texas Instruments Incorporated - https://www.ti.com/ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as diff --git a/include/linux/platform_data/usb-omap.h b/include/linux/platform_data/usb-omap.h index fa579b4c666b..5e70d667031c 100644 --- a/include/linux/platform_data/usb-omap.h +++ b/include/linux/platform_data/usb-omap.h @@ -1,7 +1,7 @@ /* * usb-omap.h - Platform data for the various OMAP USB IPs * - * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com * * This software is distributed under the terms of the GNU General Public * License ("GPL") version 2, as published by the Free Software Foundation. diff --git a/include/linux/soc/ti/k3-ringacc.h b/include/linux/soc/ti/k3-ringacc.h index 7ac115432fa1..5a472eca5ee4 100644 --- a/include/linux/soc/ti/k3-ringacc.h +++ b/include/linux/soc/ti/k3-ringacc.h @@ -2,7 +2,7 @@ /* * K3 Ring Accelerator (RA) subsystem interface * - * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2019 Texas Instruments Incorporated - https://www.ti.com */ #ifndef __SOC_TI_K3_RINGACC_API_H_ diff --git a/include/linux/soc/ti/knav_qmss.h b/include/linux/soc/ti/knav_qmss.h index 9745df6ed9d3..c75ef99c99ca 100644 --- a/include/linux/soc/ti/knav_qmss.h +++ b/include/linux/soc/ti/knav_qmss.h @@ -1,7 +1,7 @@ /* * Keystone Navigator Queue Management Sub-System header * - * Copyright (C) 2014 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2014 Texas Instruments Incorporated - https://www.ti.com * Author: Sandeep Nair * Cyril Chemparathy * Santosh Shilimkar diff --git a/include/linux/soc/ti/ti-msgmgr.h b/include/linux/soc/ti/ti-msgmgr.h index eac8e0c6fe11..1f6e76d423cf 100644 --- a/include/linux/soc/ti/ti-msgmgr.h +++ b/include/linux/soc/ti/ti-msgmgr.h @@ -1,7 +1,7 @@ /* * Texas Instruments' Message Manager * - * Copyright (C) 2015-2016 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2015-2016 Texas Instruments Incorporated - https://www.ti.com/ * Nishanth Menon * * This program is free software; you can redistribute it and/or modify diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h index e497e621dbb7..3f496967b538 100644 --- a/include/linux/wkup_m3_ipc.h +++ b/include/linux/wkup_m3_ipc.h @@ -1,7 +1,7 @@ /* * TI Wakeup M3 for AMx3 SoCs Power Management Routines * - * Copyright (C) 2015 Texas Instruments Incorporated - http://www.ti.com/ + * Copyright (C) 2015 Texas Instruments Incorporated - https://www.ti.com/ * Dave Gerlach * * This program is free software; you can redistribute it and/or diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index 52b073fea17f..df42511438d0 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -34,7 +34,7 @@ * ("BSD"). * * You can contact the author at: - * - xxHash homepage: http://cyan4973.github.io/xxHash/ + * - xxHash homepage: https://cyan4973.github.io/xxHash/ * - xxHash source repository: https://github.com/Cyan4973/xxHash */ diff --git a/include/linux/xz.h b/include/linux/xz.h index 24ad7d875977..9884c8440188 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -2,7 +2,7 @@ * XZ decompressor * * Authors: Lasse Collin - * Igor Pavlov + * Igor Pavlov * * This file has been put into the public domain. * You can do whatever you want with this file. diff --git a/include/linux/zlib.h b/include/linux/zlib.h index c757d848a758..78ede944c082 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -23,7 +23,7 @@ The data format used by the zlib library is described by RFCs (Request for - Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt + Comments) 1950 to 1952 in the files https://www.ietf.org/rfc/rfc1950.txt (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). */ diff --git a/include/soc/arc/aux.h b/include/soc/arc/aux.h index e223c4ffa153..9c2eff6140b6 100644 --- a/include/soc/arc/aux.h +++ b/include/soc/arc/aux.h @@ -22,7 +22,7 @@ static inline int read_aux_reg(u32 r) /* * function helps elide unused variable warning - * see: http://lists.infradead.org/pipermail/linux-snps-arc/2016-November/001748.html + * see: https://lists.infradead.org/pipermail/linux-snps-arc/2016-November/001748.html */ static inline void write_aux_reg(u32 r, u32 v) { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index c6dd0215482e..22220945a5fd 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -53,7 +53,7 @@ typedef __s64 Elf64_Sxword; * * - Oracle: Linker and Libraries. * Part No: 817–1984–19, August 2011. - * http://docs.oracle.com/cd/E18752_01/pdf/817-1984.pdf + * https://docs.oracle.com/cd/E18752_01/pdf/817-1984.pdf * * - System V ABI AMD64 Architecture Processor Supplement * Draft Version 0.99.4, diff --git a/include/uapi/linux/map_to_7segment.h b/include/uapi/linux/map_to_7segment.h index f9ed18134b83..13a06e5e966e 100644 --- a/include/uapi/linux/map_to_7segment.h +++ b/include/uapi/linux/map_to_7segment.h @@ -24,7 +24,7 @@ * of (ASCII) characters to a 7-segments notation. * * The 7 segment's wikipedia notation below is used as standard. - * See: http://en.wikipedia.org/wiki/Seven_segment_display + * See: https://en.wikipedia.org/wiki/Seven_segment_display * * Notation: +-a-+ * f b diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index 2fce8b6876e9..f6d2f83cbe29 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -7,7 +7,7 @@ #ifndef __ASSEMBLY__ #ifndef __KERNEL__ #ifndef __EXPORTED_HEADERS__ -#warning "Attempt to use kernel headers from user space, see http://kernelnewbies.org/KernelHeaders" +#warning "Attempt to use kernel headers from user space, see https://kernelnewbies.org/KernelHeaders" #endif /* __EXPORTED_HEADERS__ */ #endif diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 48766fdf6580..0f865ae4ba89 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -1229,7 +1229,7 @@ struct usb_set_sel_req { * As per USB compliance update, a device that is actively drawing * more than 100mA from USB must report itself as bus-powered in * the GetStatus(DEVICE) call. - * http://compliance.usb.org/index.asp?UpdateFile=Electrical&Format=Standard#34 + * https://compliance.usb.org/index.asp?UpdateFile=Electrical&Format=Standard#34 */ #define USB_SELF_POWER_VBUS_MAX_DRAW 100 -- cgit v1.2.3 From 2fb3244f0a58ceb3d866ac63f644dfa31cae430f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 11 Aug 2020 18:35:21 -0700 Subject: autofs: fix doubled word Change doubled word "is" to "it is". Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Acked-by: Ian Kent Link: http://lkml.kernel.org/r/5a82befd-40f8-8dc0-3498-cbc0436cad9b@infradead.org Signed-off-by: Linus Torvalds --- include/uapi/linux/auto_dev-ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 374742651c30..62e625356dc8 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -82,7 +82,7 @@ struct args_ismountpoint { /* * All the ioctls use this structure. * When sending a path size must account for the total length - * of the chunk of memory otherwise is is the size of the + * of the chunk of memory otherwise it is the size of the * structure. */ -- cgit v1.2.3 From 004a01241c5a0d375266ebf1c72f208de99294e9 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:04 +0200 Subject: arm64/x86: KVM: Introduce steal-time cap arm64 requires a vcpu fd (KVM_HAS_DEVICE_ATTR vcpu ioctl) to probe support for steal-time. However this is unnecessary, as only a KVM fd is required, and it complicates userspace (userspace may prefer delaying vcpu creation until after feature probing). Introduce a cap that can be checked instead. While x86 can already probe steal-time support with a kvm fd (KVM_GET_SUPPORTED_CPUID), we add the cap there too for consistency. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Reviewed-by: Steven Price Link: https://lore.kernel.org/r/20200804170604.42662-7-drjones@redhat.com --- Documentation/virt/kvm/api.rst | 13 +++++++++++++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 3 +++ arch/arm64/kvm/pvtime.c | 2 +- arch/x86/kvm/x86.c | 3 +++ include/uapi/linux/kvm.h | 1 + 6 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 49af23d2b462..d2b733dc7892 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6160,3 +6160,16 @@ KVM can therefore start protected VMs. This capability governs the KVM_S390_PV_COMMAND ioctl and the KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected guests when the state change is invalid. + +8.24 KVM_CAP_STEAL_TIME +----------------------- + +:Architectures: arm64, x86 + +This capability indicates that KVM supports steal time accounting. +When steal time accounting is supported it may be enabled with +architecture-specific interfaces. This capability and the architecture- +specific interfaces must be consistent, i.e. if one says the feature +is supported, than the other should as well and vice versa. For arm64 +see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL". +For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME". diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index dd9c3b25aa1e..af4989a25bb7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -543,6 +543,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); void kvm_update_stolen_time(struct kvm_vcpu *vcpu); +bool kvm_arm_pvtime_supported(void); int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 691d21e4c717..57876b0b870b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -206,6 +206,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = 1; break; + case KVM_CAP_STEAL_TIME: + r = kvm_arm_pvtime_supported(); + break; default: r = kvm_arch_vm_ioctl_check_extension(kvm, ext); break; diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 75234321d896..920ac43077ad 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -71,7 +71,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) return base; } -static bool kvm_arm_pvtime_supported(void) +bool kvm_arm_pvtime_supported(void) { return !!sched_info_on(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 599d73206299..c44d3a73b8eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3581,6 +3581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SMALLER_MAXPHYADDR: r = (int) allow_smaller_maxphyaddr; break; + case KVM_CAP_STEAL_TIME: + r = sched_info_on(); + break; default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f6d86033c4fa..3d8023474f2a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1035,6 +1035,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_LAST_CPU 184 #define KVM_CAP_SMALLER_MAXPHYADDR 185 #define KVM_CAP_S390_DIAG318 186 +#define KVM_CAP_STEAL_TIME 187 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From da9125df854ea48a6240c66e8a67be06e2c12c03 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 20 Aug 2020 14:12:55 +0200 Subject: netfilter: nf_tables: incorrect enum nft_list_attributes definition This should be NFTA_LIST_UNSPEC instead of NFTA_LIST_UNPEC, all other similar attribute definitions are postfixed with _UNSPEC. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 42f351c1f5c5..2b8e12f7a4a6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -133,7 +133,7 @@ enum nf_tables_msg_types { * @NFTA_LIST_ELEM: list element (NLA_NESTED) */ enum nft_list_attributes { - NFTA_LIST_UNPEC, + NFTA_LIST_UNSPEC, NFTA_LIST_ELEM, __NFTA_LIST_MAX }; -- cgit v1.2.3 From b16fc097bc283184cde40e5b30d15705e1590410 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 21 Aug 2020 15:36:42 +0200 Subject: bpf: Fix two typos in uapi/linux/bpf.h Also remove trailing whitespaces in bpf_skb_get_tunnel_key example code. Signed-off-by: Tobias Klauser Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821133642.18870-1-tklauser@distanz.ch --- include/uapi/linux/bpf.h | 10 +++++----- tools/include/uapi/linux/bpf.h | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0480f893facd..b6238b2209b7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -767,7 +767,7 @@ union bpf_attr { * * Also, note that **bpf_trace_printk**\ () is slow, and should * only be used for debugging purposes. For this reason, a notice - * bloc (spanning several lines) is printed to kernel logs and + * block (spanning several lines) is printed to kernel logs and * states that the helper should not be used "for production use" * the first time this helper is used (or more precisely, when * **trace_printk**\ () buffers are allocated). For passing values @@ -1033,14 +1033,14 @@ union bpf_attr { * * int ret; * struct bpf_tunnel_key key = {}; - * + * * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); * if (ret < 0) * return TC_ACT_SHOT; // drop packet - * + * * if (key.remote_ipv4 != 0x0a000001) * return TC_ACT_SHOT; // drop packet - * + * * return TC_ACT_OK; // accept packet * * This interface can also be used with all encapsulation devices @@ -1147,7 +1147,7 @@ union bpf_attr { * Description * Retrieve the realm or the route, that is to say the * **tclassid** field of the destination for the *skb*. The - * indentifier retrieved is a user-provided tag, similar to the + * identifier retrieved is a user-provided tag, similar to the * one used with the net_cls cgroup (see description for * **bpf_get_cgroup_classid**\ () helper), but here this tag is * held by a route (a destination entry), not by a task. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0480f893facd..b6238b2209b7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -767,7 +767,7 @@ union bpf_attr { * * Also, note that **bpf_trace_printk**\ () is slow, and should * only be used for debugging purposes. For this reason, a notice - * bloc (spanning several lines) is printed to kernel logs and + * block (spanning several lines) is printed to kernel logs and * states that the helper should not be used "for production use" * the first time this helper is used (or more precisely, when * **trace_printk**\ () buffers are allocated). For passing values @@ -1033,14 +1033,14 @@ union bpf_attr { * * int ret; * struct bpf_tunnel_key key = {}; - * + * * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); * if (ret < 0) * return TC_ACT_SHOT; // drop packet - * + * * if (key.remote_ipv4 != 0x0a000001) * return TC_ACT_SHOT; // drop packet - * + * * return TC_ACT_OK; // accept packet * * This interface can also be used with all encapsulation devices @@ -1147,7 +1147,7 @@ union bpf_attr { * Description * Retrieve the realm or the route, that is to say the * **tclassid** field of the destination for the *skb*. The - * indentifier retrieved is a user-provided tag, similar to the + * identifier retrieved is a user-provided tag, similar to the * one used with the net_cls cgroup (see description for * **bpf_get_cgroup_classid**\ () helper), but here this tag is * held by a route (a destination entry), not by a task. -- cgit v1.2.3 From 645f08975f49441b3e753d8dc5b740cbcb226594 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Aug 2020 07:27:49 -0400 Subject: net: Fix some comments Fix some comments, including wrong function name, duplicated word and so on. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- include/uapi/linux/in.h | 2 +- net/core/sock.c | 2 +- net/ipv4/raw.c | 2 +- net/l3mdev/l3mdev.c | 2 +- net/socket.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e8bca74857a3..8d9ab50b08c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -71,7 +71,7 @@ * NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or - * IPv4|UDP where the Next Header field in the IPv6 + * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with @@ -2667,7 +2667,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) * * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS) * to reduce average number of cache lines per packet. - * get_rps_cpus() for example only access one 64 bytes aligned block : + * get_rps_cpu() for example only access one 64 bytes aligned block : * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 3d0d8231dc19..7d6687618d80 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -135,7 +135,7 @@ struct in_addr { * this socket to prevent accepting spoofed ones. */ #define IP_PMTUDISC_INTERFACE 4 -/* weaker version of IP_PMTUDISC_INTERFACE, which allos packets to get +/* weaker version of IP_PMTUDISC_INTERFACE, which allows packets to get * fragmented if they exeed the interface mtu */ #define IP_PMTUDISC_OMIT 5 diff --git a/net/core/sock.c b/net/core/sock.c index e4f40b175acb..8eb2c924805a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3254,7 +3254,7 @@ void sk_common_release(struct sock *sk) sk->sk_prot->destroy(sk); /* - * Observation: when sock_common_release is called, processes have + * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6fd4330287c2..407956be7deb 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -610,7 +610,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) { ipc.oif = inet->uc_index; } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { - /* oif is set, packet is to local broadcast and + /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index e71ca5aec684..864326f150e2 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -154,7 +154,7 @@ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); /** - * l3mdev_fib_table - get FIB table id associated with an L3 + * l3mdev_fib_table_rcu - get FIB table id associated with an L3 * master interface * @dev: targeted interface */ diff --git a/net/socket.c b/net/socket.c index dbbe8ea7d395..0c0144604f81 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3610,7 +3610,7 @@ int kernel_getsockname(struct socket *sock, struct sockaddr *addr) EXPORT_SYMBOL(kernel_getsockname); /** - * kernel_peername - get the address which the socket is connected (kernel space) + * kernel_getpeername - get the address which the socket is connected (kernel space) * @sock: socket * @addr: address holder * @@ -3671,7 +3671,7 @@ int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, EXPORT_SYMBOL(kernel_sendpage_locked); /** - * kernel_shutdown - shut down part of a full-duplex connection (kernel space) + * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space) * @sock: socket * @how: connection part * -- cgit v1.2.3 From 15e9e35cd1dec2bc138464de6bf8ef828df19235 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 10 Sep 2020 18:33:51 +0800 Subject: KVM: MIPS: Change the definition of kvm type MIPS defines two kvm types: #define KVM_VM_MIPS_TE 0 #define KVM_VM_MIPS_VZ 1 In Documentation/virt/kvm/api.rst it is said that "You probably want to use 0 as machine type", which implies that type 0 be the "automatic" or "default" type. And, in user-space libvirt use the null-machine (with type 0) to detect the kvm capability, which returns "KVM not supported" on a VZ platform. I try to fix it in QEMU but it is ugly: https://lists.nongnu.org/archive/html/qemu-devel/2020-08/msg05629.html And Thomas Huth suggests me to change the definition of kvm type: https://lists.nongnu.org/archive/html/qemu-devel/2020-09/msg03281.html So I define like this: #define KVM_VM_MIPS_AUTO 0 #define KVM_VM_MIPS_VZ 1 #define KVM_VM_MIPS_TE 2 Since VZ and TE cannot co-exists, using type 0 on a TE platform will still return success (so old user-space tools have no problems on new kernels); the advantage is that using type 0 on a VZ platform will not return failure. So, the only problem is "new user-space tools use type 2 on old kernels", but if we treat this as a kernel bug, we can backport this patch to old stable kernels. Signed-off-by: Huacai Chen Message-Id: <1599734031-28746-1-git-send-email-chenhc@lemote.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 2 ++ include/uapi/linux/kvm.h | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7de85d2253ff..0c50ac444222 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -137,6 +137,8 @@ extern void kvm_init_loongson_ipi(struct kvm *kvm); int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { switch (type) { + case KVM_VM_MIPS_AUTO: + break; #ifdef CONFIG_KVM_MIPS_VZ case KVM_VM_MIPS_VZ: #else diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3d8023474f2a..7d8eced6f459 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -790,9 +790,10 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_PPC_HV 1 #define KVM_VM_PPC_PR 2 -/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */ -#define KVM_VM_MIPS_TE 0 +/* on MIPS, 0 indicates auto, 1 forces VZ ASE, 2 forces trap & emulate */ +#define KVM_VM_MIPS_AUTO 0 #define KVM_VM_MIPS_VZ 1 +#define KVM_VM_MIPS_TE 2 #define KVM_S390_SIE_PAGE_OFFSET 1 -- cgit v1.2.3 From 129134e5415d46f38b9978b3809af94ed649b57d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 11 Sep 2020 05:07:58 +0200 Subject: media: media/v4l2: remove V4L2_FLAG_MEMORY_NON_CONSISTENT flag The patch partially reverts some of the UAPI bits of the buffer cache management hints. Namely, the queue consistency (memory coherency) user-space hint because, as it turned out, the kernel implementation of this feature was misusing DMA_ATTR_NON_CONSISTENT. The patch reverts both kernel and user space parts: removes the DMA consistency attr functions, rolls back changes to v4l2_requestbuffers, v4l2_create_buffers structures and corresponding UAPI functions (plus compat32 layer) and cleans up the documentation. [hverkuil: fixed a few typos in the commit log] [hverkuil: fixed vb2_core_reqbufs call in drivers/media/dvb-core/dvb_vb2.c] [mchehab: fixed a typo in the commit log: revers->reverts] Signed-off-by: Christoph Hellwig Signed-off-by: Sergey Senozhatsky Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/buffer.rst | 17 -------- .../userspace-api/media/v4l/vidioc-create-bufs.rst | 6 +-- .../userspace-api/media/v4l/vidioc-reqbufs.rst | 12 +----- drivers/media/common/videobuf2/videobuf2-core.c | 46 +++------------------- .../media/common/videobuf2/videobuf2-dma-contig.c | 19 --------- drivers/media/common/videobuf2/videobuf2-dma-sg.c | 3 +- drivers/media/common/videobuf2/videobuf2-v4l2.c | 18 +-------- drivers/media/dvb-core/dvb_vb2.c | 2 +- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 10 +---- drivers/media/v4l2-core/v4l2-ioctl.c | 5 ++- include/media/videobuf2-core.h | 7 +--- include/uapi/linux/videodev2.h | 13 +----- 12 files changed, 23 insertions(+), 135 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/media/v4l/buffer.rst b/Documentation/userspace-api/media/v4l/buffer.rst index 57e752aaf414..2044ed13cd9d 100644 --- a/Documentation/userspace-api/media/v4l/buffer.rst +++ b/Documentation/userspace-api/media/v4l/buffer.rst @@ -701,23 +701,6 @@ Memory Consistency Flags :stub-columns: 0 :widths: 3 1 4 - * .. _`V4L2-FLAG-MEMORY-NON-CONSISTENT`: - - - ``V4L2_FLAG_MEMORY_NON_CONSISTENT`` - - 0x00000001 - - A buffer is allocated either in consistent (it will be automatically - coherent between the CPU and the bus) or non-consistent memory. The - latter can provide performance gains, for instance the CPU cache - sync/flush operations can be avoided if the buffer is accessed by the - corresponding device only and the CPU does not read/write to/from that - buffer. However, this requires extra care from the driver -- it must - guarantee memory consistency by issuing a cache flush/sync when - consistency is needed. If this flag is set V4L2 will attempt to - allocate the buffer in non-consistent memory. The flag takes effect - only if the buffer is used for :ref:`memory mapping ` I/O and the - queue reports the :ref:`V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS - ` capability. - .. c:type:: v4l2_memory enum v4l2_memory diff --git a/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst b/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst index f2a702870fad..12cf6b44f414 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst @@ -120,13 +120,9 @@ than the number requested. If you want to just query the capabilities without making any other changes, then set ``count`` to 0, ``memory`` to ``V4L2_MEMORY_MMAP`` and ``format.type`` to the buffer type. - * - __u32 - - ``flags`` - - Specifies additional buffer management attributes. - See :ref:`memory-flags`. * - __u32 - - ``reserved``\ [6] + - ``reserved``\ [7] - A place holder for future extensions. Drivers and applications must set the array to zero. diff --git a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst index 75d894d9c36c..0e3e2fde65e8 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst @@ -112,17 +112,10 @@ aborting or finishing any DMA in progress, an implicit ``V4L2_MEMORY_MMAP`` and ``type`` set to the buffer type. This will free any previously allocated buffers, so this is typically something that will be done at the start of the application. - * - union { - - (anonymous) - * - __u32 - - ``flags`` - - Specifies additional buffer management attributes. - See :ref:`memory-flags`. * - __u32 - ``reserved``\ [1] - - Kept for backwards compatibility. Use ``flags`` instead. - * - } - - + - A place holder for future extensions. Drivers and applications + must set the array to zero. .. tabularcolumns:: |p{6.1cm}|p{2.2cm}|p{8.7cm}| @@ -169,7 +162,6 @@ aborting or finishing any DMA in progress, an implicit - This capability is set by the driver to indicate that the queue supports cache and memory management hints. However, it's only valid when the queue is used for :ref:`memory mapping ` streaming I/O. See - :ref:`V4L2_FLAG_MEMORY_NON_CONSISTENT `, :ref:`V4L2_BUF_FLAG_NO_CACHE_INVALIDATE ` and :ref:`V4L2_BUF_FLAG_NO_CACHE_CLEAN `. diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index f544d3393e9d..4eab6d81cce1 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -721,39 +721,14 @@ int vb2_verify_memory_type(struct vb2_queue *q, } EXPORT_SYMBOL(vb2_verify_memory_type); -static void set_queue_consistency(struct vb2_queue *q, bool consistent_mem) -{ - q->dma_attrs &= ~DMA_ATTR_NON_CONSISTENT; - - if (!vb2_queue_allows_cache_hints(q)) - return; - if (!consistent_mem) - q->dma_attrs |= DMA_ATTR_NON_CONSISTENT; -} - -static bool verify_consistency_attr(struct vb2_queue *q, bool consistent_mem) -{ - bool queue_is_consistent = !(q->dma_attrs & DMA_ATTR_NON_CONSISTENT); - - if (consistent_mem != queue_is_consistent) { - dprintk(q, 1, "memory consistency model mismatch\n"); - return false; - } - return true; -} - int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, - unsigned int flags, unsigned int *count) + unsigned int *count) { unsigned int num_buffers, allocated_buffers, num_planes = 0; unsigned plane_sizes[VB2_MAX_PLANES] = { }; - bool consistent_mem = true; unsigned int i; int ret; - if (flags & V4L2_FLAG_MEMORY_NON_CONSISTENT) - consistent_mem = false; - if (q->streaming) { dprintk(q, 1, "streaming active\n"); return -EBUSY; @@ -765,8 +740,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, } if (*count == 0 || q->num_buffers != 0 || - (q->memory != VB2_MEMORY_UNKNOWN && q->memory != memory) || - !verify_consistency_attr(q, consistent_mem)) { + (q->memory != VB2_MEMORY_UNKNOWN && q->memory != memory)) { /* * We already have buffers allocated, so first check if they * are not in use and can be freed. @@ -803,7 +777,6 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, num_buffers = min_t(unsigned int, num_buffers, VB2_MAX_FRAME); memset(q->alloc_devs, 0, sizeof(q->alloc_devs)); q->memory = memory; - set_queue_consistency(q, consistent_mem); /* * Ask the driver how many buffers and planes per buffer it requires. @@ -888,18 +861,14 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, EXPORT_SYMBOL_GPL(vb2_core_reqbufs); int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, - unsigned int flags, unsigned int *count, + unsigned int *count, unsigned int requested_planes, const unsigned int requested_sizes[]) { unsigned int num_planes = 0, num_buffers, allocated_buffers; unsigned plane_sizes[VB2_MAX_PLANES] = { }; - bool consistent_mem = true; int ret; - if (flags & V4L2_FLAG_MEMORY_NON_CONSISTENT) - consistent_mem = false; - if (q->num_buffers == VB2_MAX_FRAME) { dprintk(q, 1, "maximum number of buffers already allocated\n"); return -ENOBUFS; @@ -912,15 +881,12 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, } memset(q->alloc_devs, 0, sizeof(q->alloc_devs)); q->memory = memory; - set_queue_consistency(q, consistent_mem); q->waiting_for_buffers = !q->is_output; } else { if (q->memory != memory) { dprintk(q, 1, "memory model mismatch\n"); return -EINVAL; } - if (!verify_consistency_attr(q, consistent_mem)) - return -EINVAL; } num_buffers = min(*count, VB2_MAX_FRAME - q->num_buffers); @@ -2581,7 +2547,7 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read) fileio->memory = VB2_MEMORY_MMAP; fileio->type = q->type; q->fileio = fileio; - ret = vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count); + ret = vb2_core_reqbufs(q, fileio->memory, &fileio->count); if (ret) goto err_kfree; @@ -2638,7 +2604,7 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read) err_reqbufs: fileio->count = 0; - vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count); + vb2_core_reqbufs(q, fileio->memory, &fileio->count); err_kfree: q->fileio = NULL; @@ -2658,7 +2624,7 @@ static int __vb2_cleanup_fileio(struct vb2_queue *q) vb2_core_streamoff(q, q->type); q->fileio = NULL; fileio->count = 0; - vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count); + vb2_core_reqbufs(q, fileio->memory, &fileio->count); kfree(fileio); dprintk(q, 3, "file io emulator closed\n"); } diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index ec3446cc45b8..7b1b86ec942d 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -42,11 +42,6 @@ struct vb2_dc_buf { struct dma_buf_attachment *db_attach; }; -static inline bool vb2_dc_buffer_consistent(unsigned long attr) -{ - return !(attr & DMA_ATTR_NON_CONSISTENT); -} - /*********************************************/ /* scatterlist table functions */ /*********************************************/ @@ -341,13 +336,6 @@ static int vb2_dc_dmabuf_ops_begin_cpu_access(struct dma_buf *dbuf, enum dma_data_direction direction) { - struct vb2_dc_buf *buf = dbuf->priv; - struct sg_table *sgt = buf->dma_sgt; - - if (vb2_dc_buffer_consistent(buf->attrs)) - return 0; - - dma_sync_sg_for_cpu(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir); return 0; } @@ -355,13 +343,6 @@ static int vb2_dc_dmabuf_ops_end_cpu_access(struct dma_buf *dbuf, enum dma_data_direction direction) { - struct vb2_dc_buf *buf = dbuf->priv; - struct sg_table *sgt = buf->dma_sgt; - - if (vb2_dc_buffer_consistent(buf->attrs)) - return 0; - - dma_sync_sg_for_device(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir); return 0; } diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 0a40e00f0d7e..a86fce5d8ea8 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -123,8 +123,7 @@ static void *vb2_dma_sg_alloc(struct device *dev, unsigned long dma_attrs, /* * NOTE: dma-sg allocates memory using the page allocator directly, so * there is no memory consistency guarantee, hence dma-sg ignores DMA - * attributes passed from the upper layer. That means that - * V4L2_FLAG_MEMORY_NON_CONSISTENT has no effect on dma-sg buffers. + * attributes passed from the upper layer. */ buf->pages = kvmalloc_array(buf->num_pages, sizeof(struct page *), GFP_KERNEL | __GFP_ZERO); diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index 30caad27281e..cfe197df970d 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -722,22 +722,12 @@ static void fill_buf_caps(struct vb2_queue *q, u32 *caps) #endif } -static void clear_consistency_attr(struct vb2_queue *q, - int memory, - unsigned int *flags) -{ - if (!q->allow_cache_hints || memory != V4L2_MEMORY_MMAP) - *flags &= ~V4L2_FLAG_MEMORY_NON_CONSISTENT; -} - int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req) { int ret = vb2_verify_memory_type(q, req->memory, req->type); fill_buf_caps(q, &req->capabilities); - clear_consistency_attr(q, req->memory, &req->flags); - return ret ? ret : vb2_core_reqbufs(q, req->memory, - req->flags, &req->count); + return ret ? ret : vb2_core_reqbufs(q, req->memory, &req->count); } EXPORT_SYMBOL_GPL(vb2_reqbufs); @@ -769,7 +759,6 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) unsigned i; fill_buf_caps(q, &create->capabilities); - clear_consistency_attr(q, create->memory, &create->flags); create->index = q->num_buffers; if (create->count == 0) return ret != -EBUSY ? ret : 0; @@ -813,7 +802,6 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) if (requested_sizes[i] == 0) return -EINVAL; return ret ? ret : vb2_core_create_bufs(q, create->memory, - create->flags, &create->count, requested_planes, requested_sizes); @@ -998,12 +986,11 @@ int vb2_ioctl_reqbufs(struct file *file, void *priv, int res = vb2_verify_memory_type(vdev->queue, p->memory, p->type); fill_buf_caps(vdev->queue, &p->capabilities); - clear_consistency_attr(vdev->queue, p->memory, &p->flags); if (res) return res; if (vb2_queue_is_busy(vdev, file)) return -EBUSY; - res = vb2_core_reqbufs(vdev->queue, p->memory, p->flags, &p->count); + res = vb2_core_reqbufs(vdev->queue, p->memory, &p->count); /* If count == 0, then the owner has released all buffers and he is no longer owner of the queue. Otherwise we have a new owner. */ if (res == 0) @@ -1021,7 +1008,6 @@ int vb2_ioctl_create_bufs(struct file *file, void *priv, p->index = vdev->queue->num_buffers; fill_buf_caps(vdev->queue, &p->capabilities); - clear_consistency_attr(vdev->queue, p->memory, &p->flags); /* * If count == 0, then just check if memory and type are valid. * Any -EBUSY result from vb2_verify_memory_type can be mapped to 0. diff --git a/drivers/media/dvb-core/dvb_vb2.c b/drivers/media/dvb-core/dvb_vb2.c index 959d110407a4..6974f1731529 100644 --- a/drivers/media/dvb-core/dvb_vb2.c +++ b/drivers/media/dvb-core/dvb_vb2.c @@ -342,7 +342,7 @@ int dvb_vb2_reqbufs(struct dvb_vb2_ctx *ctx, struct dmx_requestbuffers *req) ctx->buf_siz = req->size; ctx->buf_cnt = req->count; - ret = vb2_core_reqbufs(&ctx->vb_q, VB2_MEMORY_MMAP, 0, &req->count); + ret = vb2_core_reqbufs(&ctx->vb_q, VB2_MEMORY_MMAP, &req->count); if (ret) { ctx->state = DVB_VB2_STATE_NONE; dprintk(1, "[%s] count=%d size=%d errno=%d\n", ctx->name, diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index 593bcf6c3735..a99e82ec9ab6 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -246,9 +246,6 @@ struct v4l2_format32 { * @memory: buffer memory type * @format: frame format, for which buffers are requested * @capabilities: capabilities of this buffer type. - * @flags: additional buffer management attributes (ignored unless the - * queue has V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS capability and - * configured for MMAP streaming I/O). * @reserved: future extensions */ struct v4l2_create_buffers32 { @@ -257,8 +254,7 @@ struct v4l2_create_buffers32 { __u32 memory; /* enum v4l2_memory */ struct v4l2_format32 format; __u32 capabilities; - __u32 flags; - __u32 reserved[6]; + __u32 reserved[7]; }; static int __bufsize_v4l2_format(struct v4l2_format32 __user *p32, u32 *size) @@ -359,8 +355,7 @@ static int get_v4l2_create32(struct v4l2_create_buffers __user *p64, { if (!access_ok(p32, sizeof(*p32)) || copy_in_user(p64, p32, - offsetof(struct v4l2_create_buffers32, format)) || - assign_in_user(&p64->flags, &p32->flags)) + offsetof(struct v4l2_create_buffers32, format))) return -EFAULT; return __get_v4l2_format32(&p64->format, &p32->format, aux_buf, aux_space); @@ -422,7 +417,6 @@ static int put_v4l2_create32(struct v4l2_create_buffers __user *p64, copy_in_user(p32, p64, offsetof(struct v4l2_create_buffers32, format)) || assign_in_user(&p32->capabilities, &p64->capabilities) || - assign_in_user(&p32->flags, &p64->flags) || copy_in_user(p32->reserved, p64->reserved, sizeof(p64->reserved))) return -EFAULT; return __put_v4l2_format32(&p64->format, &p32->format); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index e3a25ea913ac..ccf947632a3b 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -2044,6 +2044,9 @@ static int v4l_reqbufs(const struct v4l2_ioctl_ops *ops, if (ret) return ret; + + CLEAR_AFTER_FIELD(p, capabilities); + return ops->vidioc_reqbufs(file, fh, p); } @@ -2083,7 +2086,7 @@ static int v4l_create_bufs(const struct v4l2_ioctl_ops *ops, if (ret) return ret; - CLEAR_AFTER_FIELD(create, flags); + CLEAR_AFTER_FIELD(create, capabilities); v4l_sanitize_format(&create->format); diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h index 52ef92049073..bbb3f26fbde9 100644 --- a/include/media/videobuf2-core.h +++ b/include/media/videobuf2-core.h @@ -744,8 +744,6 @@ void vb2_core_querybuf(struct vb2_queue *q, unsigned int index, void *pb); * vb2_core_reqbufs() - Initiate streaming. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @memory: memory type, as defined by &enum vb2_memory. - * @flags: auxiliary queue/buffer management flags. Currently, the only - * used flag is %V4L2_FLAG_MEMORY_NON_CONSISTENT. * @count: requested buffer count. * * Videobuf2 core helper to implement VIDIOC_REQBUF() operation. It is called @@ -770,13 +768,12 @@ void vb2_core_querybuf(struct vb2_queue *q, unsigned int index, void *pb); * Return: returns zero on success; an error code otherwise. */ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, - unsigned int flags, unsigned int *count); + unsigned int *count); /** * vb2_core_create_bufs() - Allocate buffers and any required auxiliary structs * @q: pointer to &struct vb2_queue with videobuf2 queue. * @memory: memory type, as defined by &enum vb2_memory. - * @flags: auxiliary queue/buffer management flags. * @count: requested buffer count. * @requested_planes: number of planes requested. * @requested_sizes: array with the size of the planes. @@ -794,7 +791,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, * Return: returns zero on success; an error code otherwise. */ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, - unsigned int flags, unsigned int *count, + unsigned int *count, unsigned int requested_planes, const unsigned int requested_sizes[]); diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c7b70ff53bc1..235db7754606 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -191,8 +191,6 @@ enum v4l2_memory { V4L2_MEMORY_DMABUF = 4, }; -#define V4L2_FLAG_MEMORY_NON_CONSISTENT (1 << 0) - /* see also http://vektor.theorem.ca/graphics/ycbcr/ */ enum v4l2_colorspace { /* @@ -949,10 +947,7 @@ struct v4l2_requestbuffers { __u32 type; /* enum v4l2_buf_type */ __u32 memory; /* enum v4l2_memory */ __u32 capabilities; - union { - __u32 flags; - __u32 reserved[1]; - }; + __u32 reserved[1]; }; /* capabilities for struct v4l2_requestbuffers and v4l2_create_buffers */ @@ -2456,9 +2451,6 @@ struct v4l2_dbg_chip_info { * @memory: enum v4l2_memory; buffer memory type * @format: frame format, for which buffers are requested * @capabilities: capabilities of this buffer type. - * @flags: additional buffer management attributes (ignored unless the - * queue has V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS capability - * and configured for MMAP streaming I/O). * @reserved: future extensions */ struct v4l2_create_buffers { @@ -2467,8 +2459,7 @@ struct v4l2_create_buffers { __u32 memory; struct v4l2_format format; __u32 capabilities; - __u32 flags; - __u32 reserved[6]; + __u32 reserved[7]; }; /* -- cgit v1.2.3 From 19a83d36f9837e8bd27435ebb31564a717a5d15a Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 17 Sep 2020 01:04:10 +0200 Subject: ethtool: add and use message type for tunnel info reply Tunnel offload info code uses ETHTOOL_MSG_TUNNEL_INFO_GET message type (cmd field in genetlink header) for replies to tunnel info netlink request, i.e. the same value as the request have. This is a problem because we are using two separate enums for userspace to kernel and kernel to userspace message types so that this ETHTOOL_MSG_TUNNEL_INFO_GET (28) collides with ETHTOOL_MSG_CABLE_TEST_TDR_NTF which is what message type 28 means for kernel to userspace messages. As the tunnel info request reached mainline in 5.9 merge window, we should still be able to fix the reply message type without breaking backward compatibility. Fixes: c7d759eb7b12 ("ethtool: add tunnel info interface") Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 3 +++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/tunnels.c | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d53bcb31645a..b5a79881551f 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -206,6 +206,7 @@ Userspace to kernel: ``ETHTOOL_MSG_TSINFO_GET`` get timestamping info ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` action start raw TDR cable test + ``ETHTOOL_MSG_TUNNEL_INFO_GET`` get tunnel offload info ===================================== ================================ Kernel to userspace: @@ -239,6 +240,7 @@ Kernel to userspace: ``ETHTOOL_MSG_TSINFO_GET_REPLY`` timestamping info ``ETHTOOL_MSG_CABLE_TEST_NTF`` Cable test results ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF`` Cable test TDR results + ``ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY`` tunnel offload info ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1363,4 +1365,5 @@ are netlink only. ``ETHTOOL_SFECPARAM`` n/a n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' n/a ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT'' + n/a ``ETHTOOL_MSG_TUNNEL_INFO_GET`` =================================== ===================================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 5dcd24cb33ea..72ba36be9655 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -79,6 +79,7 @@ enum { ETHTOOL_MSG_TSINFO_GET_REPLY, ETHTOOL_MSG_CABLE_TEST_NTF, ETHTOOL_MSG_CABLE_TEST_TDR_NTF, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c index 84f23289475b..d93bf2da0f34 100644 --- a/net/ethtool/tunnels.c +++ b/net/ethtool/tunnels.c @@ -200,7 +200,7 @@ int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) reply_len = ret + ethnl_reply_header_size(); rskb = ethnl_reply_init(reply_len, req_info.dev, - ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, ETHTOOL_A_TUNNEL_INFO_HEADER, info, &reply_payload); if (!rskb) { @@ -273,7 +273,7 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) goto cont; ehdr = ethnl_dump_put(skb, cb, - ETHTOOL_MSG_TUNNEL_INFO_GET); + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); if (!ehdr) { ret = -EMSGSIZE; goto out; -- cgit v1.2.3 From ad2b9b0f8d0107602bdc1f3b1ab90719842ace11 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Thu, 24 Sep 2020 15:23:14 -0700 Subject: tcp: skip DSACKs with dubious sequence ranges Currently, we use length of DSACKed range to compute number of delivered packets. And if sequence range in DSACK is corrupted, we can get bogus dsacked/acked count, and bogus cwnd. This patch put bounds on DSACKed range to skip update of data delivery and spurious retransmission information, if the DSACK is unlikely caused by sender's action: - DSACKed range shouldn't be greater than maximum advertised rwnd. - Total no. of DSACKed segments shouldn't be greater than total no. of retransmitted segs. Unlike spurious retransmits, network duplicates or corrupted DSACKs shouldn't be counted as delivery. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 32 +++++++++++++++++++++++++------- 3 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index cee9f8e6fce3..f84e7bcad6de 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -288,6 +288,7 @@ enum LINUX_MIB_TCPTIMEOUTREHASH, /* TCPTimeoutRehash */ LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ + LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 1074df726ec0..8d5e1695b9aa 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -293,6 +293,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), + SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 184ea556f50e..b1ce2054291d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -885,21 +885,34 @@ struct tcp_sacktag_state { struct rate_sample *rate; }; -/* Take a notice that peer is sending D-SACKs */ +/* Take a notice that peer is sending D-SACKs. Skip update of data delivery + * and spurious retransmission information if this DSACK is unlikely caused by + * sender's action: + * - DSACKed sequence range is larger than maximum receiver's window. + * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. + */ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, u32 end_seq, struct tcp_sacktag_state *state) { u32 seq_len, dup_segs = 1; - if (before(start_seq, end_seq)) { - seq_len = end_seq - start_seq; - if (seq_len > tp->mss_cache) - dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); - } + if (!before(start_seq, end_seq)) + return 0; + + seq_len = end_seq - start_seq; + /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ + if (seq_len > tp->max_window) + return 0; + if (seq_len > tp->mss_cache) + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + + tp->dsack_dups += dup_segs; + /* Skip the DSACK if dup segs weren't retransmitted by sender */ + if (tp->dsack_dups > tp->total_retrans) + return 0; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; - tp->dsack_dups += dup_segs; state->flag |= FLAG_DSACKING_ACK; /* A spurious retransmission is delivered */ @@ -1153,6 +1166,11 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + if (!dup_segs) { /* Skip dubious DSACK */ + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); + return false; + } + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); /* D-SACK for already forgotten data... Do dumb counting. */ -- cgit v1.2.3 From 2d914c1bf079491d1113051a7232250267f3f2e4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 30 Sep 2020 21:27:18 +0100 Subject: rxrpc: Fix accept on a connection that need securing When a new incoming call arrives at an userspace rxrpc socket on a new connection that has a security class set, the code currently pushes it onto the accept queue to hold a ref on it for the socket. This doesn't work, however, as recvmsg() pops it off, notices that it's in the SERVER_SECURING state and discards the ref. This means that the call runs out of refs too early and the kernel oopses. By contrast, a kernel rxrpc socket manually pre-charges the incoming call pool with calls that already have user call IDs assigned, so they are ref'd by the call tree on the socket. Change the mode of operation for userspace rxrpc server sockets to work like this too. Although this is a UAPI change, server sockets aren't currently functional. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells --- include/uapi/linux/rxrpc.h | 2 +- net/rxrpc/ar-internal.h | 7 +- net/rxrpc/call_accept.c | 263 +++++++-------------------------------------- net/rxrpc/call_object.c | 5 +- net/rxrpc/conn_event.c | 2 +- net/rxrpc/recvmsg.c | 36 +------ net/rxrpc/sendmsg.c | 15 +-- 7 files changed, 49 insertions(+), 281 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 4accfa7e266d..8f8dc7a937a4 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -51,11 +51,11 @@ enum rxrpc_cmsg_type { RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ - RXRPC_ACCEPT = 9, /* s-: [Service] accept request */ RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ RXRPC_SET_CALL_TIMEOUT = 13, /* s-: Set one or more call timeouts */ + RXRPC_CHARGE_ACCEPT = 14, /* s-: Charge the accept pool with a user call ID */ RXRPC__SUPPORTED }; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 884cff7bb169..97aebb5d19db 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -518,7 +518,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ - RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ @@ -714,8 +713,8 @@ struct rxrpc_ack_summary { enum rxrpc_command { RXRPC_CMD_SEND_DATA, /* send data message */ RXRPC_CMD_SEND_ABORT, /* request abort generation */ - RXRPC_CMD_ACCEPT, /* [server] accept incoming call */ RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ + RXRPC_CMD_CHARGE_ACCEPT, /* [server] charge accept preallocation */ }; struct rxrpc_call_params { @@ -755,9 +754,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_sock *, struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); -struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long, - rxrpc_notify_rx_t); -int rxrpc_reject_call(struct rxrpc_sock *); +int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); /* * call_event.c diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index ef160566aa9a..8df1964db333 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -39,8 +39,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, unsigned int debug_id) { const void *here = __builtin_return_address(0); - struct rxrpc_call *call; + struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); + struct rb_node *parent, **pp; int max, tmp; unsigned int size = RXRPC_BACKLOG_MAX; unsigned int head, tail, call_head, call_tail; @@ -94,7 +95,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, } /* Now it gets complicated, because calls get registered with the - * socket here, particularly if a user ID is preassigned by the user. + * socket here, with a user ID preassigned by the user. */ call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) @@ -107,34 +108,33 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, here, (const void *)user_call_ID); write_lock(&rx->call_lock); - if (user_attach_call) { - struct rxrpc_call *xcall; - struct rb_node *parent, **pp; - - /* Check the user ID isn't already in use */ - pp = &rx->calls.rb_node; - parent = NULL; - while (*pp) { - parent = *pp; - xcall = rb_entry(parent, struct rxrpc_call, sock_node); - if (user_call_ID < xcall->user_call_ID) - pp = &(*pp)->rb_left; - else if (user_call_ID > xcall->user_call_ID) - pp = &(*pp)->rb_right; - else - goto id_in_use; - } - call->user_call_ID = user_call_ID; - call->notify_rx = notify_rx; + /* Check the user ID isn't already in use */ + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + xcall = rb_entry(parent, struct rxrpc_call, sock_node); + if (user_call_ID < xcall->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > xcall->user_call_ID) + pp = &(*pp)->rb_right; + else + goto id_in_use; + } + + call->user_call_ID = user_call_ID; + call->notify_rx = notify_rx; + if (user_attach_call) { rxrpc_get_call(call, rxrpc_call_got_kernel); user_attach_call(call, user_call_ID); - rxrpc_get_call(call, rxrpc_call_got_userid); - rb_link_node(&call->sock_node, parent, pp); - rb_insert_color(&call->sock_node, &rx->calls); - set_bit(RXRPC_CALL_HAS_USERID, &call->flags); } + rxrpc_get_call(call, rxrpc_call_got_userid); + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + set_bit(RXRPC_CALL_HAS_USERID, &call->flags); + list_add(&call->sock_link, &rx->sock_calls); write_unlock(&rx->call_lock); @@ -157,11 +157,8 @@ id_in_use: } /* - * Preallocate sufficient service connections, calls and peers to cover the - * entire backlog of a socket. When a new call comes in, if we don't have - * sufficient of each available, the call gets rejected as busy or ignored. - * - * The backlog is replenished when a connection is accepted or rejected. + * Allocate the preallocation buffers for incoming service calls. These must + * be charged manually. */ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) { @@ -174,13 +171,6 @@ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) rx->backlog = b; } - if (rx->discard_new_call) - return 0; - - while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp, - atomic_inc_return(&rxrpc_debug_id)) == 0) - ; - return 0; } @@ -333,6 +323,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_see_call(call); call->conn = conn; call->security = conn->security; + call->security_ix = conn->security_ix; call->peer = rxrpc_get_peer(conn->params.peer); call->cong_cwnd = call->peer->cong_cwnd; return call; @@ -402,8 +393,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, if (rx->notify_new_call) rx->notify_new_call(&rx->sk, call, call->user_call_ID); - else - sk_acceptq_added(&rx->sk); spin_lock(&conn->state_lock); switch (conn->state) { @@ -415,12 +404,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, case RXRPC_CONN_SERVICE: write_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - if (rx->discard_new_call) - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; - else - call->state = RXRPC_CALL_SERVER_ACCEPTING; - } + if (call->state < RXRPC_CALL_COMPLETE) + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; write_unlock(&call->state_lock); break; @@ -440,9 +425,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_send_ping(call, skb); - if (call->state == RXRPC_CALL_SERVER_ACCEPTING) - rxrpc_notify_socket(call); - /* We have to discard the prealloc queue's ref here and rely on a * combination of the RCU read lock and refs held either by the socket * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel @@ -460,187 +442,18 @@ no_call: } /* - * handle acceptance of a call by userspace - * - assign the user call ID to the call at the front of the queue - * - called with the socket locked. + * Charge up socket with preallocated calls, attaching user call IDs. */ -struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, - unsigned long user_call_ID, - rxrpc_notify_rx_t notify_rx) - __releases(&rx->sk.sk_lock.slock) - __acquires(call->user_mutex) +int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) { - struct rxrpc_call *call; - struct rb_node *parent, **pp; - int ret; - - _enter(",%lx", user_call_ID); - - ASSERT(!irqs_disabled()); - - write_lock(&rx->call_lock); - - if (list_empty(&rx->to_be_accepted)) { - write_unlock(&rx->call_lock); - release_sock(&rx->sk); - kleave(" = -ENODATA [empty]"); - return ERR_PTR(-ENODATA); - } - - /* check the user ID isn't already in use */ - pp = &rx->calls.rb_node; - parent = NULL; - while (*pp) { - parent = *pp; - call = rb_entry(parent, struct rxrpc_call, sock_node); - - if (user_call_ID < call->user_call_ID) - pp = &(*pp)->rb_left; - else if (user_call_ID > call->user_call_ID) - pp = &(*pp)->rb_right; - else - goto id_in_use; - } - - /* Dequeue the first call and check it's still valid. We gain - * responsibility for the queue's reference. - */ - call = list_entry(rx->to_be_accepted.next, - struct rxrpc_call, accept_link); - write_unlock(&rx->call_lock); - - /* We need to gain the mutex from the interrupt handler without - * upsetting lockdep, so we have to release it there and take it here. - * We are, however, still holding the socket lock, so other accepts - * must wait for us and no one can add the user ID behind our backs. - */ - if (mutex_lock_interruptible(&call->user_mutex) < 0) { - release_sock(&rx->sk); - kleave(" = -ERESTARTSYS"); - return ERR_PTR(-ERESTARTSYS); - } - - write_lock(&rx->call_lock); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - rxrpc_see_call(call); - - /* Find the user ID insertion point. */ - pp = &rx->calls.rb_node; - parent = NULL; - while (*pp) { - parent = *pp; - call = rb_entry(parent, struct rxrpc_call, sock_node); - - if (user_call_ID < call->user_call_ID) - pp = &(*pp)->rb_left; - else if (user_call_ID > call->user_call_ID) - pp = &(*pp)->rb_right; - else - BUG(); - } - - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_SERVER_ACCEPTING: - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; - break; - case RXRPC_CALL_COMPLETE: - ret = call->error; - goto out_release; - default: - BUG(); - } - - /* formalise the acceptance */ - call->notify_rx = notify_rx; - call->user_call_ID = user_call_ID; - rxrpc_get_call(call, rxrpc_call_got_userid); - rb_link_node(&call->sock_node, parent, pp); - rb_insert_color(&call->sock_node, &rx->calls); - if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) - BUG(); - - write_unlock_bh(&call->state_lock); - write_unlock(&rx->call_lock); - rxrpc_notify_socket(call); - rxrpc_service_prealloc(rx, GFP_KERNEL); - release_sock(&rx->sk); - _leave(" = %p{%d}", call, call->debug_id); - return call; - -out_release: - _debug("release %p", call); - write_unlock_bh(&call->state_lock); - write_unlock(&rx->call_lock); - rxrpc_release_call(rx, call); - rxrpc_put_call(call, rxrpc_call_put); - goto out; - -id_in_use: - ret = -EBADSLT; - write_unlock(&rx->call_lock); -out: - rxrpc_service_prealloc(rx, GFP_KERNEL); - release_sock(&rx->sk); - _leave(" = %d", ret); - return ERR_PTR(ret); -} - -/* - * Handle rejection of a call by userspace - * - reject the call at the front of the queue - */ -int rxrpc_reject_call(struct rxrpc_sock *rx) -{ - struct rxrpc_call *call; - bool abort = false; - int ret; - - _enter(""); - - ASSERT(!irqs_disabled()); - - write_lock(&rx->call_lock); - - if (list_empty(&rx->to_be_accepted)) { - write_unlock(&rx->call_lock); - return -ENODATA; - } - - /* Dequeue the first call and check it's still valid. We gain - * responsibility for the queue's reference. - */ - call = list_entry(rx->to_be_accepted.next, - struct rxrpc_call, accept_link); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - rxrpc_see_call(call); + struct rxrpc_backlog *b = rx->backlog; - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_SERVER_ACCEPTING: - __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, -ECONNABORTED); - abort = true; - fallthrough; - case RXRPC_CALL_COMPLETE: - ret = call->error; - goto out_discard; - default: - BUG(); - } + if (rx->sk.sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; -out_discard: - write_unlock_bh(&call->state_lock); - write_unlock(&rx->call_lock); - if (abort) { - rxrpc_send_abort_packet(call); - rxrpc_release_call(rx, call); - rxrpc_put_call(call, rxrpc_call_put); - } - rxrpc_service_prealloc(rx, GFP_KERNEL); - _leave(" = %d", ret); - return ret; + return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, + GFP_KERNEL, + atomic_inc_return(&rxrpc_debug_id)); } /* diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a40fae013942..ed49769b459d 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -23,7 +23,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", - [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", @@ -352,9 +351,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->call_id = sp->hdr.callNumber; call->service_id = sp->hdr.serviceId; call->cid = sp->hdr.cid; - call->state = RXRPC_CALL_SERVER_ACCEPTING; - if (sp->hdr.securityIndex > 0) - call->state = RXRPC_CALL_SERVER_SECURING; + call->state = RXRPC_CALL_SERVER_SECURING; call->cong_tstamp = skb->tstamp; /* Set the channel for this call. We don't get channel_lock as we're diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 6e972b4823ef..64ace2960ecc 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -269,7 +269,7 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) if (call) { write_lock_bh(&call->state_lock); if (call->state == RXRPC_CALL_SERVER_SECURING) { - call->state = RXRPC_CALL_SERVER_ACCEPTING; + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; rxrpc_notify_socket(call); } write_unlock_bh(&call->state_lock); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index c4684dde1f16..2c842851d72e 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -178,37 +178,6 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) return ret; } -/* - * Pass back notification of a new call. The call is added to the - * to-be-accepted list. This means that the next call to be accepted might not - * be the last call seen awaiting acceptance, but unless we leave this on the - * front of the queue and block all other messages until someone gives us a - * user_ID for it, there's not a lot we can do. - */ -static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct msghdr *msg, int flags) -{ - int tmp = 0, ret; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp); - - if (ret == 0 && !(flags & MSG_PEEK)) { - _debug("to be accepted"); - write_lock_bh(&rx->recvmsg_lock); - list_del_init(&call->recvmsg_link); - write_unlock_bh(&rx->recvmsg_lock); - - rxrpc_get_call(call, rxrpc_call_got); - write_lock(&rx->call_lock); - list_add_tail(&call->accept_link, &rx->to_be_accepted); - write_unlock(&rx->call_lock); - } - - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret); - return ret; -} - /* * End the packet reception phase. */ @@ -630,9 +599,6 @@ try_again: } switch (READ_ONCE(call->state)) { - case RXRPC_CALL_SERVER_ACCEPTING: - ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); - break; case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: @@ -728,7 +694,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, call->debug_id, rxrpc_call_states[call->state], iov_iter_count(iter), want_more); - ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); + ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_SECURING); mutex_lock(&call->user_mutex); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0824e103d037..d27140c836cc 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -530,10 +530,10 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) return -EINVAL; break; - case RXRPC_ACCEPT: + case RXRPC_CHARGE_ACCEPT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; - p->command = RXRPC_CMD_ACCEPT; + p->command = RXRPC_CMD_CHARGE_ACCEPT; if (len != 0) return -EINVAL; break; @@ -659,16 +659,12 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (ret < 0) goto error_release_sock; - if (p.command == RXRPC_CMD_ACCEPT) { + if (p.command == RXRPC_CMD_CHARGE_ACCEPT) { ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) goto error_release_sock; - call = rxrpc_accept_call(rx, p.call.user_call_ID, NULL); - /* The socket is now unlocked. */ - if (IS_ERR(call)) - return PTR_ERR(call); - ret = 0; - goto out_put_unlock; + ret = rxrpc_user_charge_accept(rx, p.call.user_call_ID); + goto error_release_sock; } call = rxrpc_find_call_by_user_ID(rx, p.call.user_call_ID); @@ -690,7 +686,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) case RXRPC_CALL_CLIENT_AWAIT_CONN: case RXRPC_CALL_SERVER_PREALLOC: case RXRPC_CALL_SERVER_SECURING: - case RXRPC_CALL_SERVER_ACCEPTING: rxrpc_put_call(call, rxrpc_call_put); ret = -EBUSY; goto error_release_sock; -- cgit v1.2.3